Louis BECQUEY

Merge branch 'master' of https://github.com/persalteas/RNANet into master

@@ -7,7 +7,7 @@ Future versions might compute a real MSA-based clusering directly with Rfamseq n
 This script prepares the dataset from available public data in PDB and Rfam.
 
 
-**Please cite**: *Coming soon, expect it summer 2020*
+**Please cite**: *Coming soon, expect it in 2021*
 
 # What it does
 The script follows these steps:
@@ -72,7 +72,7 @@ You need to install:
 
 ## Command line
 Run `./RNANet.py --3d-folder path/to/3D/data/folder --seq-folder path/to/sequence/data/folder [ - other options ]`.
-It requires solid hardware to run. It takes around 15 hours the first time, and 9h then, tested on a server with 32 cores and 48GB of RAM.
+It requires solid hardware to run. It takes around 12 to 15 hours the first time, and 1 to 3h afterwards, tested on a server with 32 cores and 48GB of RAM.
The detailed list of options is below:
 
 ```
...
@@ -273,32 +273,39 @@ class Chain:
         if self.mapping is not None:
             self.mapping.log(f"Shifting nt_resnum numbering because of {n_dup} duplicate residues {df.iloc[i,1]}")
 
-        if df.iloc[i,1] == df.iloc[i-1,1] and df.iloc[index_last_dup + 1, 1] - 1 > df.iloc[index_last_dup, 1]:
-            # The redundant nts are consecutive in the chain (at the begining at least), and there is a gap at the end
-
-            if duplicates.iloc[n_dup-1, 0] - duplicates.iloc[0, 0] + 1 == n_dup:
-                # They are all contiguous in the chain
-                # 4v9n-DA case (and similar ones) : 610-611-611A-611B-611C-611D-611E-611F-611G-617-618...
-                # there is a redundancy (611) followed by a gap (611-617).
-                # We want the redundancy to fill the gap.
-                df.iloc[i:i+n_dup-1, 1] += 1
-            else:
-                # We solve the problem continous component by continuous component
-                for j in range(1, n_dup+1):
-                    if duplicates.iloc[j,0] == 1 + duplicates.iloc[j-1,0]: # continuous
-                        df.iloc[i+j-1,1] += 1
-                    else:
-                        break
-        elif df.iloc[i,1] == df.iloc[i-1,1]:
-            # Common 4v9q-DV case (and similar ones) : e.g. chains contains 17 and 17A which are both read 17 by DSSR.
-            # Solution : we shift the numbering of 17A (to 18) and the following residues.
-            df.iloc[i:, 1] += 1
-        else:
-            # 4v9k-DA case (and similar ones) : the nt_id is not the full nt_resnum: ... 1629 > 1630 > 163B > 1631 > ...
-            # Here the 163B is read 163 by DSSR, but there already is a residue 163.
-            # Solution : set nt_resnum[i] to nt_resnum[i-1] + 1, and shift the following by 1.
-            df.iloc[i, 1] = 1 + df.iloc[i-1, 1]
-            df.iloc[i+1:, 1] += 1
+        try:
+            if i > 0 and index_last_dup + 1 < len(df.index) and df.iloc[i,1] == df.iloc[i-1,1] and df.iloc[index_last_dup + 1, 1] - 1 > df.iloc[index_last_dup, 1]:
+                # The redundant nts are consecutive in the chain (at the beginning at least), and there is a gap at the end
+
+                if duplicates.iloc[n_dup-1, 0] - duplicates.iloc[0, 0] + 1 == n_dup:
+                    # They are all contiguous in the chain
+                    # 4v9n-DA case (and similar ones) : 610-611-611A-611B-611C-611D-611E-611F-611G-617-618...
+                    # there is a redundancy (611) followed by a gap (611-617).
+                    # We want the redundancy to fill the gap.
+                    df.iloc[i:i+n_dup-1, 1] += 1
+                else:
+                    # We solve the problem continuous component by continuous component
+                    for j in range(1, n_dup+1):
+                        if duplicates.iloc[j,0] == 1 + duplicates.iloc[j-1,0]: # continuous
+                            df.iloc[i+j-1,1] += 1
+                        else:
+                            break
+            elif df.iloc[i,1] == df.iloc[i-1,1]:
+                # Common 4v9q-DV case (and similar ones) : e.g. the chain contains 17 and 17A, which are both read 17 by DSSR.
+                # Solution : we shift the numbering of 17A (to 18) and the following residues.
+                df.iloc[i:, 1] += 1
+            else:
+                # 4v9k-DA case (and similar ones) : the nt_id is not the full nt_resnum: ... 1629 > 1630 > 163B > 1631 > ...
+                # Here the 163B is read 163 by DSSR, but there already is a residue 163.
+                # Solution : set nt_resnum[i] to nt_resnum[i-1] + 1, and shift the following by 1.
+                df.iloc[i, 1] = 1 + df.iloc[i-1, 1]
+                df.iloc[i+1:, 1] += 1
+        except:
+            warn(f"Error with parsing of {self.chain_label} duplicate residue numbers. Ignoring it.")
+            self.delete_me = True
+            self.error_messages = f"Error with parsing of duplicate residue numbers."
+            return None
+
 
         # Search for ligands at the end of the selection
         # Drop ligands detected as residues by DSSR, by detecting several markers
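The three branches above cover the duplicate `nt_resnum` situations produced by DSSR: a duplicate followed by a gap (4v9n-DA), insertion codes such as 17/17A read as the same number (4v9q-DV), and truncated residue numbers (4v9k-DA). As a quick illustration of the most common branch, here is a minimal, self-contained sketch on a toy DataFrame; the toy values are assumptions and only mimic `df` (with `nt_resnum` in column 1), this is not RNANet code.

```
# Toy illustration of the "17 and 17A are both read 17" case handled above (not RNANet code).
import pandas as pd

df = pd.DataFrame({"index_chain": [1, 2, 3, 4, 5],
                   "nt_resnum":   [15, 16, 17, 17, 18]})  # 17A was read as a second 17

i = 3  # row of the duplicated residue
if df.iloc[i, 1] == df.iloc[i - 1, 1]:
    df.iloc[i:, 1] += 1  # renumber 17A to 18 and shift every following residue

print(df["nt_resnum"].tolist())  # [15, 16, 17, 18, 19]
```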
@@ -1019,7 +1026,7 @@ class Pipeline:
             print(f"nohup bash -c 'time {runDir}/RNAnet.py --3d-folder ~/Data/RNA/3D/ --seq-folder ~/Data/RNA/sequences -s --archive' &")
             sys.exit()
         elif opt == '--version':
-            print("RNANet 1.0 alpha ")
+            print("RNANet 1.1 beta")
             sys.exit()
         elif opt == "-r" or opt == "--resolution":
             assert float(arg) > 0.0 and float(arg) <= 20.0
@@ -1382,7 +1389,7 @@ class Pipeline:
         # Remove previous precomputed data
         subprocess.run(["rm","-f", "data/wadley_kernel_eta.npz", "data/wadley_kernel_eta_prime.npz", "data/pair_counts.csv"])
         for f in self.fam_list:
-            subprocess.run(["rm","-f", f"data/{f}.npy"])
+            subprocess.run(["rm","-f", f"data/{f}.npy", f"data/{f}_pairs.csv", f"data/{f}_counts.csv"])
 
         # Run statistics files
         os.chdir(runDir)
@@ -1390,13 +1397,12 @@ class Pipeline:
         subprocess.run(["python3.8", "statistics.py", path_to_3D_data, path_to_seq_data])
 
         # Save additional informations
-        conn = sqlite3.connect(runDir+"/results/RNANet.db")
-        pd.read_sql_query("SELECT rfam_acc, description, idty_percent, nb_homologs, nb_3d_chains, nb_total_homol, max_len, comput_time, comput_peak_mem from family ORDER BY nb_3d_chains DESC;",
-                          conn).to_csv(runDir + f"/results/archive/families_{time_str}.csv", float_format="%.2f", index=False)
-        pd.read_sql_query("""SELECT structure_id, chain_name, pdb_start, pdb_end, rfam_acc, inferred, date, exp_method, resolution, issue FROM structure
-                             JOIN chain ON structure.pdb_id = chain.structure_id
-                             ORDER BY structure_id, chain_name, rfam_acc ASC;""", conn).to_csv(runDir + f"/results/archive/summary_{time_str}.csv", float_format="%.2f", index=False)
-        conn.close()
+        with sqlite3.connect(runDir+"/results/RNANet.db") as conn:
+            pd.read_sql_query("SELECT rfam_acc, description, idty_percent, nb_homologs, nb_3d_chains, nb_total_homol, max_len, comput_time, comput_peak_mem from family ORDER BY nb_3d_chains DESC;",
+                              conn).to_csv(runDir + f"/results/archive/families_{time_str}.csv", float_format="%.2f", index=False)
+            pd.read_sql_query("""SELECT structure_id, chain_name, pdb_start, pdb_end, rfam_acc, inferred, date, exp_method, resolution, issue FROM structure
+                                 JOIN chain ON structure.pdb_id = chain.structure_id
+                                 ORDER BY structure_id, chain_name, rfam_acc ASC;""", conn).to_csv(runDir + f"/results/archive/summary_{time_str}.csv", float_format="%.2f", index=False)
 
         # Archive the results
         if self.SELECT_ONLY is None:
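The change above swaps the explicit connect/close pair for sqlite3's connection context manager, which commits the pending transaction on success and rolls it back on error; note that it does not close the connection itself. A minimal sketch of the same export pattern, with an assumed database path, output file name, and shortened query:

```
# Sketch of the with-connection export pattern; paths and the short query are examples only.
import sqlite3
import pandas as pd

with sqlite3.connect("results/RNANet.db") as conn:
    df = pd.read_sql_query("SELECT rfam_acc, nb_3d_chains FROM family ORDER BY nb_3d_chains DESC;", conn)
    df.to_csv("families_example.csv", float_format="%.2f", index=False)
conn.close()  # the with-block ended the transaction, not the connection
```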
@@ -1404,7 +1410,10 @@ class Pipeline:
             subprocess.run(["tar","-C", path_to_3D_data + "/datapoints","-czf",f"results/archive/RNANET_datapoints_{time_str}.tar.gz","."])
 
             # Update shortcuts to latest versions
-            subprocess.run(["rm", "-f", runDir + "/results/RNANET_datapoints_latest.tar.gz", runDir + "/results/summary_latest.csv", runDir + "/results/families_latest.csv"])
+            subprocess.run(["rm", "-f", runDir + "/results/RNANET_datapoints_latest.tar.gz",
+                                        runDir + "/results/summary_latest.csv",
+                                        runDir + "/results/families_latest.csv"
+                            ])
             subprocess.run(['ln',"-s", runDir +f"/results/archive/RNANET_datapoints_{time_str}.tar.gz", runDir + "/results/RNANET_datapoints_latest.tar.gz"])
             subprocess.run(['ln',"-s", runDir +f"/results/archive/summary_{time_str}.csv", runDir + "/results/summary_latest.csv"])
             subprocess.run(['ln',"-s", runDir +f"/results/archive/families_{time_str}.csv", runDir + "/results/families_latest.csv"])
@@ -1631,6 +1640,7 @@ def sql_ask_database(conn, sql, warn_every = 10):
 
 @trace_unhandled_exceptions
 def sql_execute(conn, sql, many=False, data=None, warn_every=10):
+    conn.execute('pragma journal_mode=wal') # Allow multiple other readers to ask things while we execute this writing query
     for _ in range(100): # retry 100 times if it fails
         try:
             if many:
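The added pragma switches the database to write-ahead logging, so concurrent readers are no longer blocked while `sql_execute()` holds the write lock. WAL mode is persistent and keeps the extra `RNANet.db-wal` / `RNANet.db-shm` side files that the cron job further below deletes after a run. A standalone sketch (hypothetical database file) showing the switch and checking that it took effect:

```
# Standalone sketch: enable WAL on a throwaway database and print the active journal mode.
import sqlite3

conn = sqlite3.connect("example.db")
mode = conn.execute("pragma journal_mode=wal").fetchone()[0]
print(mode)  # 'wal' once write-ahead logging is active
conn.close()
```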
@@ -2397,6 +2407,7 @@ if __name__ == "__main__":
             rfam_acc_to_download[c.mapping.rfam_acc] = [ c ]
         else:
             rfam_acc_to_download[c.mapping.rfam_acc].append(c)
+
     print(f"> Identified {len(rfam_acc_to_download.keys())} families to update and re-align with the crystals' sequences")
     pp.fam_list = sorted(rfam_acc_to_download.keys())
 
...
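The if/else above simply groups the selected chains by Rfam accession before re-alignment; the same accumulation could be written with `collections.defaultdict`. A toy sketch under that assumption (plain accession strings stand in for the chain objects and `c.mapping.rfam_acc`; this is not the project's code):

```
# Toy sketch: grouping by Rfam accession with a defaultdict (strings stand in for Chain objects).
from collections import defaultdict

accessions = ["RF00005", "RF02541", "RF00005"]  # stand-ins for c.mapping.rfam_acc
rfam_acc_to_download = defaultdict(list)
for acc in accessions:
    rfam_acc_to_download[acc].append(acc)

print(sorted(rfam_acc_to_download.keys()))  # ['RF00005', 'RF02541'], i.e. the future fam_list
```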
 # This is a script supposed to be run periodically as a cron job
 
+cd /home/lbecquey/Projects/RNANet
+rm -f latest_run.log errors.txt
+
 # Run RNANet
-cd /home/lbecquey/Projects/RNANet;
-rm -f stdout.txt stderr.txt errors.txt;
-time './RNAnet.py --3d-folder /home/lbequey/Data/RNA/3D/ --seq-folder /home/lbecquey/Data/RNA/sequences/ -s -r 20.0' > stdout.txt 2> stderr.txt;
+bash -c 'time ./RNAnet.py --3d-folder /home/lbecquey/Data/RNA/3D/ --seq-folder /home/lbecquey/Data/RNA/sequences/ -r 20.0 -s --archive' &> latest_run.log
+touch results/RNANet.db # update last modification date
+rm -f results/RNANet.db-wal results/RNANet.db-shm # SQLite temporary files
 
-# Sync in Seafile
-seaf-cli start;
+# Compress
+rm -f /home/lbecquey/Projects/RNANet/results/RNANet.db.gz
+echo 'Deleted results/RNANet.db.gz (if existed)' >> latest_run.log
+gzip -k /home/lbecquey/Projects/RNANet/results/RNANet.db
+echo 'Recreated it.' >> latest_run.log
 
-seaf-cli stop;
+# Sync in Seafile
+seaf-cli start >> latest_run.log 2>&1
+echo 'Waiting 10m for SeaFile synchronization...' >> latest_run.log
+sleep 10m
+echo `seaf-cli status` >> latest_run.log
+seaf-cli stop >> latest_run.log 2>&1
+echo 'We are '`date`', update completed.' >> latest_run.log
 
...
@@ -10,6 +10,17 @@ for KILLPID in $PROCESS_LIST; do
     fi
 done
 
+PROCESS_TO_KILL="statistics.py"
+PROCESS_LIST=`ps ax | grep -Ei ${PROCESS_TO_KILL} | grep -Eiv '(grep|vi statistics.py)' | awk ' { print $1;}'`
+KILLED=
+for KILLPID in $PROCESS_LIST; do
+    if [ ! -z $KILLPID ];then
+        kill -9 $KILLPID
+        echo "Killed PID ${KILLPID}"
+        KILLED=yes
+    fi
+done
+
 if [ -z $KILLED ];then
     echo "Didn't kill anything"
 fi
...
+1ml5_1_a_1-2914
+1ml5_1_a_151-2903
+1ml5_1_A_7-1518
+1ml5_1_A_7-1515
+1ml5_1_A_2-1520
+1ml5_1_b_5-121
+2rdo_1_A_3-118
+4v48_1_A9_3-118
+4v47_1_A9_3-118
+1vy7_1_AY_1-73
+1vy7_1_CY_1-73
+4w2h_1_CY_1-73
+6zmi_1_L8_1267-4755
+6zm7_1_L8_1267-4755
+6y6x_1_L8_1267-4755
+6z6n_1_L8_1267-4755
+6qzp_1_L8_1267-4755
+6zme_1_L8_1267-4755
+6z6l_1_L8_1267-4755
+6ek0_1_L8_1267-4755
+6zmo_1_L8_1267-4755
+6z6m_1_L8_1267-4755
+6ole_1_D_1267-4755
+6om0_1_D_1267-4755
+6y2l_1_L8_1267-4755
+6y0g_1_L8_1267-4755
+6oli_1_D_1267-4755
+6olg_1_A3_1267-4755
+6y57_1_L8_1267-4755
+5t2c_1_C_1267-4755
+6om7_1_D_1267-4755
+4ug0_1_L8_1267-4755
+6olf_1_D_1267-4755
+6ip5_1_1C_1267-4755
+6ip8_1_1C_1267-4755
+6olz_1_A3_1267-4755
+5aj0_1_A3_1267-4755
+5lks_1_L8_1267-4755
+6ip6_1_1C_1267-4755
+4v6x_1_A8_1267-4755
+2z9q_1_A_1-72
+1ls2_1_B_1-73
+3ep2_1_Y_1-72
+3eq3_1_Y_1-72
+4v48_1_A6_1-73
+1gsg_1_T_1-72
+3jcr_1_H_1-115
+1eg0_1_O_1-73
+4v42_1_BB_5-121
+4v42_1_BA_1-2914
+4v42_1_BA_151-2903
+2ob7_1_A_10-319
+1x1l_1_A_1-130
+1zc8_1_Z_1-130
+1zc8_1_Z_1-91
+2ob7_1_D_1-130
+1r2x_1_C_1-58
+1r2w_1_C_1-58
+1eg0_1_L_1-56
+1eg0_1_L_1-57
+6rxu_1_C2_588-2386
+6rxu_1_C2_588-2383
+6rxu_1_C2_583-2388
+5oql_1_2_588-2386
+5oql_1_2_588-2383
+5oql_1_2_583-2388
+6rxv_1_C2_588-2386
+6rxv_1_C2_588-2383
+6rxv_1_C2_583-2388
+6rxz_1_C2_588-2386
+6rxz_1_C2_588-2383
+6rxz_1_C2_583-2388
+6rxy_1_C2_588-2386
+6rxy_1_C2_588-2383
+6rxy_1_C2_583-2388
+6rxt_1_C2_588-2386
+6rxt_1_C2_588-2383
+6rxt_1_C2_583-2388
+4v48_1_BA_1-91
+4v48_1_BA_6-1541
+4v48_1_BA_6-1538
+4v48_1_BA_1-1543
+4v47_1_BA_1-91
+4v47_1_BA_6-1540
+4v47_1_BA_6-1537
+4v47_1_BA_1-1542
+2rdo_1_B_6-1460
+2rdo_1_B_6-1522
+2rdo_1_B_1-2903
+2rdo_1_B_6-1457
+2rdo_1_B_1-2904
+2rdo_1_B_1-1528
+2rdo_1_B_160-2893
+4v48_1_A0_6-1460
+4v48_1_A0_6-1522
+4v48_1_A0_1-2903
+4v48_1_A0_6-1457
+4v48_1_A0_1-2904
+4v48_1_A0_1-1528
+4v48_1_A0_160-2893
+4v47_1_A0_6-1460
+4v47_1_A0_6-1522
+4v47_1_A0_1-2903
+4v47_1_A0_6-1457
+4v47_1_A0_1-2904
+4v47_1_A0_1-1528
+4v47_1_A0_160-2893
+1zc8_1_A_1-59
+1mvr_1_D_1-59
+4c9d_1_D_29-1
+4c9d_1_C_29-1
+4adx_1_9_1-121
+1zn1_1_B_1-59
+1emi_1_B_1-108
+3iy9_1_A_498-1027
+1jgq_1_A_20-55
+1jgq_1_A_7-1518
+1jgq_1_A_7-1515
+1jgq_1_A_2-1520
+4v42_1_AA_20-55
+4v42_1_AA_7-1518
+4v42_1_AA_7-1515
+4v42_1_AA_2-1520
+1jgo_1_A_20-55
+1jgo_1_A_7-1518
+1jgo_1_A_7-1515
+1jgo_1_A_2-1520
+1jgp_1_A_20-55
+1jgp_1_A_7-1518
+1jgp_1_A_7-1515
+1jgp_1_A_2-1520
+3ep2_1_B_1-50
+3eq3_1_B_1-50
+3eq4_1_B_1-50
+3pgw_1_R_1-164
+3pgw_1_N_1-164
+3cw1_1_x_1-138
+3cw1_1_w_1-138
+3cw1_1_V_1-138
+3cw1_1_v_1-138
+2iy3_1_B_9-105
+3jcr_1_N_1-106
+3jcr_1_N_1-188
+2vaz_1_A_64-177
+2ftc_1_R_81-1466
+2ftc_1_R_1-1568
+2ftc_1_R_792-1568
+3jcr_1_M_1-141
+3jcr_1_M_1-107
+3jcr_1_M_1-188
+4v5z_1_B0_1-2840
+4v5z_1_B0_1-2899
+4v5z_1_B0_1-2902
+5g2x_1_A_595-692
+3iy8_1_A_1-540
+4v5z_1_BY_2-113
+4v5z_1_BZ_1-70
+1mvr_1_B_1-96
+4adx_1_0_1-2923
+4adx_1_0_132-2915
+3eq4_1_Y_1-69
+4v5z_1_AA_1-1562
+4v5z_1_AA_1-1563
+6lqm_1_8_1267-4755
+6lu8_1_8_1267-4755
+6lsr_1_8_1267-4755
+6lss_1_8_1267-4755