Louis BECQUEY

Merge branch 'master' of https://github.com/persalteas/RNANet into master

...@@ -7,7 +7,7 @@ Future versions might compute a real MSA-based clustering directly with Rfamseq n ...@@ -7,7 +7,7 @@ Future versions might compute a real MSA-based clustering directly with Rfamseq n
7 This script prepares the dataset from available public data in PDB and Rfam. 7 This script prepares the dataset from available public data in PDB and Rfam.
8 8
9 9
10 -**Please cite**: *Coming soon, expect it summer 2020* 10 +**Please cite**: *Coming soon, expect it in 2021*
11 11
12 # What it does 12 # What it does
13 The script follows these steps: 13 The script follows these steps:
...@@ -72,7 +72,7 @@ You need to install: ...@@ -72,7 +72,7 @@ You need to install:
72 72
73 ## Command line 73 ## Command line
74 Run `./RNANet.py --3d-folder path/to/3D/data/folder --seq-folder path/to/sequence/data/folder [ - other options ]`. 74 Run `./RNANet.py --3d-folder path/to/3D/data/folder --seq-folder path/to/sequence/data/folder [ - other options ]`.
75 -It requires solid hardware to run. It takes around 15 hours the first time, and 9h then, tested on a server with 32 cores and 48GB of RAM. 75 +It requires solid hardware to run. It takes around 12 to 15 hours the first time, and 1 to 3h then, tested on a server with 32 cores and 48GB of RAM.
76 The detailed list of options is below: 76 The detailed list of options is below:
77 77
78 ``` 78 ```
......
...@@ -273,32 +273,39 @@ class Chain: ...@@ -273,32 +273,39 @@ class Chain:
273 if self.mapping is not None: 273 if self.mapping is not None:
274 self.mapping.log(f"Shifting nt_resnum numbering because of {n_dup} duplicate residues {df.iloc[i,1]}") 274 self.mapping.log(f"Shifting nt_resnum numbering because of {n_dup} duplicate residues {df.iloc[i,1]}")
275 275
276 - if df.iloc[i,1] == df.iloc[i-1,1] and df.iloc[index_last_dup + 1, 1] - 1 > df.iloc[index_last_dup, 1]: 276 + try:
277 - # The redundant nts are consecutive in the chain (at the begining at least), and there is a gap at the end 277 + if i > 0 and index_last_dup +1 < len(df.index) and df.iloc[i,1] == df.iloc[i-1,1] and df.iloc[index_last_dup + 1, 1] - 1 > df.iloc[index_last_dup, 1]:
278 - 278 + # The redundant nts are consecutive in the chain (at the begining at least), and there is a gap at the end
279 - if duplicates.iloc[n_dup-1, 0] - duplicates.iloc[0, 0] + 1 == n_dup: 279 +
280 - # They are all contiguous in the chain 280 + if duplicates.iloc[n_dup-1, 0] - duplicates.iloc[0, 0] + 1 == n_dup:
281 - # 4v9n-DA case (and similar ones) : 610-611-611A-611B-611C-611D-611E-611F-611G-617-618... 281 + # They are all contiguous in the chain
282 - # there is a redundancy (611) followed by a gap (611-617). 282 + # 4v9n-DA case (and similar ones) : 610-611-611A-611B-611C-611D-611E-611F-611G-617-618...
283 - # We want the redundancy to fill the gap. 283 + # there is a redundancy (611) followed by a gap (611-617).
284 - df.iloc[i:i+n_dup-1, 1] += 1 284 + # We want the redundancy to fill the gap.
285 + df.iloc[i:i+n_dup-1, 1] += 1
286 + else:
287 + # We solve the problem continous component by continuous component
288 + for j in range(1, n_dup+1):
289 + if duplicates.iloc[j,0] == 1 + duplicates.iloc[j-1,0]: # continuous
290 + df.iloc[i+j-1,1] += 1
291 + else:
292 + break
293 + elif df.iloc[i,1] == df.iloc[i-1,1]:
294 + # Common 4v9q-DV case (and similar ones) : e.g. chains contains 17 and 17A which are both read 17 by DSSR.
295 + # Solution : we shift the numbering of 17A (to 18) and the following residues.
296 + df.iloc[i:, 1] += 1
285 else: 297 else:
286 - # We solve the problem continous component by continuous component 298 + # 4v9k-DA case (and similar ones) : the nt_id is not the full nt_resnum: ... 1629 > 1630 > 163B > 1631 > ...
287 - for j in range(1, n_dup+1): 299 + # Here the 163B is read 163 by DSSR, but there already is a residue 163.
288 - if duplicates.iloc[j,0] == 1 + duplicates.iloc[j-1,0]: # continuous 300 + # Solution : set nt_resnum[i] to nt_resnum[i-1] + 1, and shift the following by 1.
289 - df.iloc[i+j-1,1] += 1 301 + df.iloc[i, 1] = 1 + df.iloc[i-1, 1]
290 - else: 302 + df.iloc[i+1:, 1] += 1
291 - break 303 + except:
292 - elif df.iloc[i,1] == df.iloc[i-1,1]: 304 + warn(f"Error with parsing of {self.chain_label} duplicate residue numbers. Ignoring it.")
293 - # Common 4v9q-DV case (and similar ones) : e.g. chains contains 17 and 17A which are both read 17 by DSSR. 305 + self.delete_me = True
294 - # Solution : we shift the numbering of 17A (to 18) and the following residues. 306 + self.error_messages = f"Error with parsing of duplicate residues numbers."
295 - df.iloc[i:, 1] += 1 307 + return None
296 - else: 308 +
297 - # 4v9k-DA case (and similar ones) : the nt_id is not the full nt_resnum: ... 1629 > 1630 > 163B > 1631 > ...
298 - # Here the 163B is read 163 by DSSR, but there already is a residue 163.
299 - # Solution : set nt_resnum[i] to nt_resnum[i-1] + 1, and shift the following by 1.
300 - df.iloc[i, 1] = 1 + df.iloc[i-1, 1]
301 - df.iloc[i+1:, 1] += 1
302 309
303 # Search for ligands at the end of the selection 310 # Search for ligands at the end of the selection
304 # Drop ligands detected as residues by DSSR, by detecting several markers 311 # Drop ligands detected as residues by DSSR, by detecting several markers
...@@ -1019,7 +1026,7 @@ class Pipeline: ...@@ -1019,7 +1026,7 @@ class Pipeline:
1019 print(f"nohup bash -c 'time {runDir}/RNAnet.py --3d-folder ~/Data/RNA/3D/ --seq-folder ~/Data/RNA/sequences -s --archive' &") 1026 print(f"nohup bash -c 'time {runDir}/RNAnet.py --3d-folder ~/Data/RNA/3D/ --seq-folder ~/Data/RNA/sequences -s --archive' &")
1020 sys.exit() 1027 sys.exit()
1021 elif opt == '--version': 1028 elif opt == '--version':
1022 - print("RNANet 1.0 alpha ") 1029 + print("RNANet 1.1 beta")
1023 sys.exit() 1030 sys.exit()
1024 elif opt == "-r" or opt == "--resolution": 1031 elif opt == "-r" or opt == "--resolution":
1025 assert float(arg) > 0.0 and float(arg) <= 20.0 1032 assert float(arg) > 0.0 and float(arg) <= 20.0
...@@ -1382,7 +1389,7 @@ class Pipeline: ...@@ -1382,7 +1389,7 @@ class Pipeline:
1382 # Remove previous precomputed data 1389 # Remove previous precomputed data
1383 subprocess.run(["rm","-f", "data/wadley_kernel_eta.npz", "data/wadley_kernel_eta_prime.npz", "data/pair_counts.csv"]) 1390 subprocess.run(["rm","-f", "data/wadley_kernel_eta.npz", "data/wadley_kernel_eta_prime.npz", "data/pair_counts.csv"])
1384 for f in self.fam_list: 1391 for f in self.fam_list:
1385 - subprocess.run(["rm","-f", f"data/{f}.npy"]) 1392 + subprocess.run(["rm","-f", f"data/{f}.npy", f"data/{f}_pairs.csv", f"data/{f}_counts.csv"])
1386 1393
1387 # Run statistics files 1394 # Run statistics files
1388 os.chdir(runDir) 1395 os.chdir(runDir)
...@@ -1390,13 +1397,12 @@ class Pipeline: ...@@ -1390,13 +1397,12 @@ class Pipeline:
1390 subprocess.run(["python3.8", "statistics.py", path_to_3D_data, path_to_seq_data]) 1397 subprocess.run(["python3.8", "statistics.py", path_to_3D_data, path_to_seq_data])
1391 1398
1392 # Save additional informations 1399 # Save additional informations
1393 - conn = sqlite3.connect(runDir+"/results/RNANet.db") 1400 + with sqlite3.connect(runDir+"/results/RNANet.db") as conn:
1394 - pd.read_sql_query("SELECT rfam_acc, description, idty_percent, nb_homologs, nb_3d_chains, nb_total_homol, max_len, comput_time, comput_peak_mem from family ORDER BY nb_3d_chains DESC;", 1401 + pd.read_sql_query("SELECT rfam_acc, description, idty_percent, nb_homologs, nb_3d_chains, nb_total_homol, max_len, comput_time, comput_peak_mem from family ORDER BY nb_3d_chains DESC;",
1395 - conn).to_csv(runDir + f"/results/archive/families_{time_str}.csv", float_format="%.2f", index=False) 1402 + conn).to_csv(runDir + f"/results/archive/families_{time_str}.csv", float_format="%.2f", index=False)
1396 - pd.read_sql_query("""SELECT structure_id, chain_name, pdb_start, pdb_end, rfam_acc, inferred, date, exp_method, resolution, issue FROM structure 1403 + pd.read_sql_query("""SELECT structure_id, chain_name, pdb_start, pdb_end, rfam_acc, inferred, date, exp_method, resolution, issue FROM structure
1397 - JOIN chain ON structure.pdb_id = chain.structure_id 1404 + JOIN chain ON structure.pdb_id = chain.structure_id
1398 - ORDER BY structure_id, chain_name, rfam_acc ASC;""", conn).to_csv(runDir + f"/results/archive/summary_{time_str}.csv", float_format="%.2f", index=False) 1405 + ORDER BY structure_id, chain_name, rfam_acc ASC;""", conn).to_csv(runDir + f"/results/archive/summary_{time_str}.csv", float_format="%.2f", index=False)
1399 - conn.close()
1400 1406
1401 # Archive the results 1407 # Archive the results
1402 if self.SELECT_ONLY is None: 1408 if self.SELECT_ONLY is None:
...@@ -1404,7 +1410,10 @@ class Pipeline: ...@@ -1404,7 +1410,10 @@ class Pipeline:
1404 subprocess.run(["tar","-C", path_to_3D_data + "/datapoints","-czf",f"results/archive/RNANET_datapoints_{time_str}.tar.gz","."]) 1410 subprocess.run(["tar","-C", path_to_3D_data + "/datapoints","-czf",f"results/archive/RNANET_datapoints_{time_str}.tar.gz","."])
1405 1411
1406 # Update shortcuts to latest versions 1412 # Update shortcuts to latest versions
1407 - subprocess.run(["rm", "-f", runDir + "/results/RNANET_datapoints_latest.tar.gz", runDir + "/results/summary_latest.csv", runDir + "/results/families_latest.csv"]) 1413 + subprocess.run(["rm", "-f", runDir + "/results/RNANET_datapoints_latest.tar.gz",
1414 + runDir + "/results/summary_latest.csv",
1415 + runDir + "/results/families_latest.csv"
1416 + ])
1408 subprocess.run(['ln',"-s", runDir +f"/results/archive/RNANET_datapoints_{time_str}.tar.gz", runDir + "/results/RNANET_datapoints_latest.tar.gz"]) 1417 subprocess.run(['ln',"-s", runDir +f"/results/archive/RNANET_datapoints_{time_str}.tar.gz", runDir + "/results/RNANET_datapoints_latest.tar.gz"])
1409 subprocess.run(['ln',"-s", runDir +f"/results/archive/summary_{time_str}.csv", runDir + "/results/summary_latest.csv"]) 1418 subprocess.run(['ln',"-s", runDir +f"/results/archive/summary_{time_str}.csv", runDir + "/results/summary_latest.csv"])
1410 subprocess.run(['ln',"-s", runDir +f"/results/archive/families_{time_str}.csv", runDir + "/results/families_latest.csv"]) 1419 subprocess.run(['ln',"-s", runDir +f"/results/archive/families_{time_str}.csv", runDir + "/results/families_latest.csv"])
...@@ -1631,6 +1640,7 @@ def sql_ask_database(conn, sql, warn_every = 10): ...@@ -1631,6 +1640,7 @@ def sql_ask_database(conn, sql, warn_every = 10):
1631 1640
1632 @trace_unhandled_exceptions 1641 @trace_unhandled_exceptions
1633 def sql_execute(conn, sql, many=False, data=None, warn_every=10): 1642 def sql_execute(conn, sql, many=False, data=None, warn_every=10):
1643 + conn.execute('pragma journal_mode=wal') # Allow multiple other readers to ask things while we execute this writing query
1634 for _ in range(100): # retry 100 times if it fails 1644 for _ in range(100): # retry 100 times if it fails
1635 try: 1645 try:
1636 if many: 1646 if many:
...@@ -2397,6 +2407,7 @@ if __name__ == "__main__": ...@@ -2397,6 +2407,7 @@ if __name__ == "__main__":
2397 rfam_acc_to_download[c.mapping.rfam_acc] = [ c ] 2407 rfam_acc_to_download[c.mapping.rfam_acc] = [ c ]
2398 else: 2408 else:
2399 rfam_acc_to_download[c.mapping.rfam_acc].append(c) 2409 rfam_acc_to_download[c.mapping.rfam_acc].append(c)
2410 +
2400 print(f"> Identified {len(rfam_acc_to_download.keys())} families to update and re-align with the crystals' sequences") 2411 print(f"> Identified {len(rfam_acc_to_download.keys())} families to update and re-align with the crystals' sequences")
2401 pp.fam_list = sorted(rfam_acc_to_download.keys()) 2412 pp.fam_list = sorted(rfam_acc_to_download.keys())
2402 2413
......
1 # This is a script supposed to be run periodically as a cron job 1 # This is a script supposed to be run periodically as a cron job
2 2
3 +cd /home/lbecquey/Projects/RNANet
4 +rm -f latest_run.log errors.txt
5 +
3 # Run RNANet 6 # Run RNANet
4 -cd /home/lbecquey/Projects/RNANet; 7 +bash -c 'time ./RNAnet.py --3d-folder /home/lbecquey/Data/RNA/3D/ --seq-folder /home/lbecquey/Data/RNA/sequences/ -r 20.0 -s --archive' &> latest_run.log
5 -rm -f stdout.txt stderr.txt errors.txt; 8 +touch results/RNANet.db # update last modification date
6 -time './RNAnet.py --3d-folder /home/lbequey/Data/RNA/3D/ --seq-folder /home/lbecquey/Data/RNA/sequences/ -s -r 20.0' > stdout.txt 2> stderr.txt; 9 +rm -f results/RNANet.db-wal results/RNANet.db-shm # SQLite temporary files
7 10
8 -# Sync in Seafile 11 +# Compress
9 -seaf-cli start; 12 +rm -f /home/lbecquey/Projects/RNANet/results/RNANet.db.gz
13 +echo 'Deleted results/RNANet.db.gz (if existed)' >> latest_run.log
14 +gzip -k /home/lbecquey/Projects/RNANet/results/RNANet.db
15 +echo 'Recreated it.' >> latest_run.log
10 16
11 -seaf-cli stop; 17 +# Sync in Seafile
18 +seaf-cli start >> latest_run.log 2>&1
19 +echo 'Waiting 10m for SeaFile synchronization...' >> latest_run.log
20 +sleep 10m
21 +echo `seaf-cli status` >> latest_run.log
22 +seaf-cli stop >> latest_run.log 2>&1
23 +echo 'We are '`date`', update completed.' >> latest_run.log
12 24
......
...@@ -10,6 +10,17 @@ for KILLPID in $PROCESS_LIST; do ...@@ -10,6 +10,17 @@ for KILLPID in $PROCESS_LIST; do
10 fi 10 fi
11 done 11 done
12 12
13 +PROCESS_TO_KILL="statistics.py"
14 +PROCESS_LIST=`ps ax | grep -Ei ${PROCESS_TO_KILL} | grep -Eiv '(grep|vi statistics.py)' | awk ' { print $1;}'`
15 +KILLED=
16 +for KILLPID in $PROCESS_LIST; do
17 + if [ ! -z $KILLPID ];then
18 + kill -9 $KILLPID
19 + echo "Killed PID ${KILLPID}"
20 + KILLED=yes
21 + fi
22 +done
23 +
13 if [ -z $KILLED ];then 24 if [ -z $KILLED ];then
14 echo "Didn't kill anything" 25 echo "Didn't kill anything"
15 fi 26 fi
......
1 +1ml5_1_a_1-2914
2 +1ml5_1_a_151-2903
3 +1ml5_1_A_7-1518
4 +1ml5_1_A_7-1515
5 +1ml5_1_A_2-1520
6 +1ml5_1_b_5-121
7 +2rdo_1_A_3-118
8 +4v48_1_A9_3-118
9 +4v47_1_A9_3-118
10 +1vy7_1_AY_1-73
11 +1vy7_1_CY_1-73
12 +4w2h_1_CY_1-73
13 +6zmi_1_L8_1267-4755
14 +6zm7_1_L8_1267-4755
15 +6y6x_1_L8_1267-4755
16 +6z6n_1_L8_1267-4755
17 +6qzp_1_L8_1267-4755
18 +6zme_1_L8_1267-4755
19 +6z6l_1_L8_1267-4755
20 +6ek0_1_L8_1267-4755
21 +6zmo_1_L8_1267-4755
22 +6z6m_1_L8_1267-4755
23 +6ole_1_D_1267-4755
24 +6om0_1_D_1267-4755
25 +6y2l_1_L8_1267-4755
26 +6y0g_1_L8_1267-4755
27 +6oli_1_D_1267-4755
28 +6olg_1_A3_1267-4755
29 +6y57_1_L8_1267-4755
30 +5t2c_1_C_1267-4755
31 +6om7_1_D_1267-4755
32 +4ug0_1_L8_1267-4755
33 +6olf_1_D_1267-4755
34 +6ip5_1_1C_1267-4755
35 +6ip8_1_1C_1267-4755
36 +6olz_1_A3_1267-4755
37 +5aj0_1_A3_1267-4755
38 +5lks_1_L8_1267-4755
39 +6ip6_1_1C_1267-4755
40 +4v6x_1_A8_1267-4755
41 +2z9q_1_A_1-72
42 +1ls2_1_B_1-73
43 +3ep2_1_Y_1-72
44 +3eq3_1_Y_1-72
45 +4v48_1_A6_1-73
46 +1gsg_1_T_1-72
47 +3jcr_1_H_1-115
48 +1eg0_1_O_1-73
49 +4v42_1_BB_5-121
50 +4v42_1_BA_1-2914
51 +4v42_1_BA_151-2903
52 +2ob7_1_A_10-319
53 +1x1l_1_A_1-130
54 +1zc8_1_Z_1-130
55 +1zc8_1_Z_1-91
56 +2ob7_1_D_1-130
57 +1r2x_1_C_1-58
58 +1r2w_1_C_1-58
59 +1eg0_1_L_1-56
60 +1eg0_1_L_1-57
61 +6rxu_1_C2_588-2386
62 +6rxu_1_C2_588-2383
63 +6rxu_1_C2_583-2388
64 +5oql_1_2_588-2386
65 +5oql_1_2_588-2383
66 +5oql_1_2_583-2388
67 +6rxv_1_C2_588-2386
68 +6rxv_1_C2_588-2383
69 +6rxv_1_C2_583-2388
70 +6rxz_1_C2_588-2386
71 +6rxz_1_C2_588-2383
72 +6rxz_1_C2_583-2388
73 +6rxy_1_C2_588-2386
74 +6rxy_1_C2_588-2383
75 +6rxy_1_C2_583-2388
76 +6rxt_1_C2_588-2386
77 +6rxt_1_C2_588-2383
78 +6rxt_1_C2_583-2388
79 +4v48_1_BA_1-91
80 +4v48_1_BA_6-1541
81 +4v48_1_BA_6-1538
82 +4v48_1_BA_1-1543
83 +4v47_1_BA_1-91
84 +4v47_1_BA_6-1540
85 +4v47_1_BA_6-1537
86 +4v47_1_BA_1-1542
87 +2rdo_1_B_6-1460
88 +2rdo_1_B_6-1522
89 +2rdo_1_B_1-2903
90 +2rdo_1_B_6-1457
91 +2rdo_1_B_1-2904
92 +2rdo_1_B_1-1528
93 +2rdo_1_B_160-2893
94 +4v48_1_A0_6-1460
95 +4v48_1_A0_6-1522
96 +4v48_1_A0_1-2903
97 +4v48_1_A0_6-1457
98 +4v48_1_A0_1-2904
99 +4v48_1_A0_1-1528
100 +4v48_1_A0_160-2893
101 +4v47_1_A0_6-1460
102 +4v47_1_A0_6-1522
103 +4v47_1_A0_1-2903
104 +4v47_1_A0_6-1457
105 +4v47_1_A0_1-2904
106 +4v47_1_A0_1-1528
107 +4v47_1_A0_160-2893
108 +1zc8_1_A_1-59
109 +1mvr_1_D_1-59
110 +4c9d_1_D_29-1
111 +4c9d_1_C_29-1
112 +4adx_1_9_1-121
113 +1zn1_1_B_1-59
114 +1emi_1_B_1-108
115 +3iy9_1_A_498-1027
116 +1jgq_1_A_20-55
117 +1jgq_1_A_7-1518
118 +1jgq_1_A_7-1515
119 +1jgq_1_A_2-1520
120 +4v42_1_AA_20-55
121 +4v42_1_AA_7-1518
122 +4v42_1_AA_7-1515
123 +4v42_1_AA_2-1520
124 +1jgo_1_A_20-55
125 +1jgo_1_A_7-1518
126 +1jgo_1_A_7-1515
127 +1jgo_1_A_2-1520
128 +1jgp_1_A_20-55
129 +1jgp_1_A_7-1518
130 +1jgp_1_A_7-1515
131 +1jgp_1_A_2-1520
132 +3ep2_1_B_1-50
133 +3eq3_1_B_1-50
134 +3eq4_1_B_1-50
135 +3pgw_1_R_1-164
136 +3pgw_1_N_1-164
137 +3cw1_1_x_1-138
138 +3cw1_1_w_1-138
139 +3cw1_1_V_1-138
140 +3cw1_1_v_1-138
141 +2iy3_1_B_9-105
142 +3jcr_1_N_1-106
143 +3jcr_1_N_1-188
144 +2vaz_1_A_64-177
145 +2ftc_1_R_81-1466
146 +2ftc_1_R_1-1568
147 +2ftc_1_R_792-1568
148 +3jcr_1_M_1-141
149 +3jcr_1_M_1-107
150 +3jcr_1_M_1-188
151 +4v5z_1_B0_1-2840
152 +4v5z_1_B0_1-2899
153 +4v5z_1_B0_1-2902
154 +5g2x_1_A_595-692
155 +3iy8_1_A_1-540
156 +4v5z_1_BY_2-113
157 +4v5z_1_BZ_1-70
158 +1mvr_1_B_1-96
159 +4adx_1_0_1-2923
160 +4adx_1_0_132-2915
161 +3eq4_1_Y_1-69
162 +4v5z_1_AA_1-1562
163 +4v5z_1_AA_1-1563
164 +6lqm_1_8_1267-4755
165 +6lu8_1_8_1267-4755
166 +6lsr_1_8_1267-4755
167 +6lss_1_8_1267-4755
1 +1ml5_1_a_1-2914
2 +Could not find nucleotides of chain a in annotation 1ml5.json. Either there is a problem with 1ml5 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
3 +
4 +1ml5_1_a_151-2903
5 +Could not find nucleotides of chain a in annotation 1ml5.json. Either there is a problem with 1ml5 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
6 +
7 +1ml5_1_A_7-1518
8 +Could not find nucleotides of chain A in annotation 1ml5.json. Either there is a problem with 1ml5 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
9 +
10 +1ml5_1_A_7-1515
11 +Could not find nucleotides of chain A in annotation 1ml5.json. Either there is a problem with 1ml5 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
12 +
13 +1ml5_1_A_2-1520
14 +Could not find nucleotides of chain A in annotation 1ml5.json. Either there is a problem with 1ml5 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
15 +
16 +1ml5_1_b_5-121
17 +Could not find nucleotides of chain b in annotation 1ml5.json. Either there is a problem with 1ml5 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
18 +
19 +2rdo_1_A_3-118
20 +DSSR warning 2rdo.json: no nucleotides found. Ignoring 2rdo_1_A_3-118.
21 +
22 +4v48_1_A9_3-118
23 +DSSR warning 4v48.json: no nucleotides found. Ignoring 4v48_1_A9_3-118.
24 +
25 +4v47_1_A9_3-118
26 +DSSR warning 4v47.json: no nucleotides found. Ignoring 4v47_1_A9_3-118.
27 +
28 +1vy7_1_AY_1-73
29 +Sequence is too short. (< 5 resolved nts)
30 +
31 +1vy7_1_CY_1-73
32 +Sequence is too short. (< 5 resolved nts)
33 +
34 +4w2h_1_CY_1-73
35 +Sequence is too short. (< 5 resolved nts)
36 +
37 +6zmi_1_L8_1267-4755
38 +Could not find nucleotides of chain L8 in annotation 6zmi.json. Either there is a problem with 6zmi mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
39 +
40 +6zm7_1_L8_1267-4755
41 +Could not find nucleotides of chain L8 in annotation 6zm7.json. Either there is a problem with 6zm7 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
42 +
43 +6y6x_1_L8_1267-4755
44 +Could not find nucleotides of chain L8 in annotation 6y6x.json. Either there is a problem with 6y6x mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
45 +
46 +6z6n_1_L8_1267-4755
47 +Could not find nucleotides of chain L8 in annotation 6z6n.json. Either there is a problem with 6z6n mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
48 +
49 +6qzp_1_L8_1267-4755
50 +Could not find nucleotides of chain L8 in annotation 6qzp.json. Either there is a problem with 6qzp mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
51 +
52 +6zme_1_L8_1267-4755
53 +Could not find nucleotides of chain L8 in annotation 6zme.json. Either there is a problem with 6zme mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
54 +
55 +6z6l_1_L8_1267-4755
56 +Could not find nucleotides of chain L8 in annotation 6z6l.json. Either there is a problem with 6z6l mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
57 +
58 +6ek0_1_L8_1267-4755
59 +Could not find nucleotides of chain L8 in annotation 6ek0.json. Either there is a problem with 6ek0 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
60 +
61 +6zmo_1_L8_1267-4755
62 +Could not find nucleotides of chain L8 in annotation 6zmo.json. Either there is a problem with 6zmo mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
63 +
64 +6z6m_1_L8_1267-4755
65 +Could not find nucleotides of chain L8 in annotation 6z6m.json. Either there is a problem with 6z6m mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
66 +
67 +6ole_1_D_1267-4755
68 +Could not find nucleotides of chain D in annotation 6ole.json. Either there is a problem with 6ole mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
69 +
70 +6om0_1_D_1267-4755
71 +Could not find nucleotides of chain D in annotation 6om0.json. Either there is a problem with 6om0 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
72 +
73 +6y2l_1_L8_1267-4755
74 +Could not find nucleotides of chain L8 in annotation 6y2l.json. Either there is a problem with 6y2l mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
75 +
76 +6y0g_1_L8_1267-4755
77 +Could not find nucleotides of chain L8 in annotation 6y0g.json. Either there is a problem with 6y0g mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
78 +
79 +6oli_1_D_1267-4755
80 +Could not find nucleotides of chain D in annotation 6oli.json. Either there is a problem with 6oli mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
81 +
82 +6olg_1_A3_1267-4755
83 +Could not find nucleotides of chain A3 in annotation 6olg.json. Either there is a problem with 6olg mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
84 +
85 +6y57_1_L8_1267-4755
86 +Could not find nucleotides of chain L8 in annotation 6y57.json. Either there is a problem with 6y57 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
87 +
88 +5t2c_1_C_1267-4755
89 +Could not find nucleotides of chain C in annotation 5t2c.json. Either there is a problem with 5t2c mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
90 +
91 +6om7_1_D_1267-4755
92 +Could not find nucleotides of chain D in annotation 6om7.json. Either there is a problem with 6om7 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
93 +
94 +4ug0_1_L8_1267-4755
95 +Could not find nucleotides of chain L8 in annotation 4ug0.json. Either there is a problem with 4ug0 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
96 +
97 +6olf_1_D_1267-4755
98 +Could not find nucleotides of chain D in annotation 6olf.json. Either there is a problem with 6olf mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
99 +
100 +6ip5_1_1C_1267-4755
101 +Could not find nucleotides of chain 1C in annotation 6ip5.json. Either there is a problem with 6ip5 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
102 +
103 +6ip8_1_1C_1267-4755
104 +Could not find nucleotides of chain 1C in annotation 6ip8.json. Either there is a problem with 6ip8 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
105 +
106 +6olz_1_A3_1267-4755
107 +Could not find nucleotides of chain A3 in annotation 6olz.json. Either there is a problem with 6olz mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
108 +
109 +5aj0_1_A3_1267-4755
110 +Could not find nucleotides of chain A3 in annotation 5aj0.json. Either there is a problem with 5aj0 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
111 +
112 +5lks_1_L8_1267-4755
113 +Could not find nucleotides of chain L8 in annotation 5lks.json. Either there is a problem with 5lks mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
114 +
115 +6ip6_1_1C_1267-4755
116 +Could not find nucleotides of chain 1C in annotation 6ip6.json. Either there is a problem with 6ip6 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
117 +
118 +4v6x_1_A8_1267-4755
119 +Could not find nucleotides of chain A8 in annotation 4v6x.json. Either there is a problem with 4v6x mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
120 +
121 +2z9q_1_A_1-72
122 +DSSR warning 2z9q.json: no nucleotides found. Ignoring 2z9q_1_A_1-72.
123 +
124 +1ls2_1_B_1-73
125 +DSSR warning 1ls2.json: no nucleotides found. Ignoring 1ls2_1_B_1-73.
126 +
127 +3ep2_1_Y_1-72
128 +DSSR warning 3ep2.json: no nucleotides found. Ignoring 3ep2_1_Y_1-72.
129 +
130 +3eq3_1_Y_1-72
131 +DSSR warning 3eq3.json: no nucleotides found. Ignoring 3eq3_1_Y_1-72.
132 +
133 +4v48_1_A6_1-73
134 +DSSR warning 4v48.json: no nucleotides found. Ignoring 4v48_1_A6_1-73.
135 +
136 +1gsg_1_T_1-72
137 +DSSR warning 1gsg.json: no nucleotides found. Ignoring 1gsg_1_T_1-72.
138 +
139 +3jcr_1_H_1-115
140 +DSSR warning 3jcr.json: no nucleotides found. Ignoring 3jcr_1_H_1-115.
141 +
142 +1eg0_1_O_1-73
143 +DSSR warning 1eg0.json: no nucleotides found. Ignoring 1eg0_1_O_1-73.
144 +
145 +4v42_1_BB_5-121
146 +Could not find nucleotides of chain BB in annotation 4v42.json. Either there is a problem with 4v42 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
147 +
148 +4v42_1_BA_1-2914
149 +Could not find nucleotides of chain BA in annotation 4v42.json. Either there is a problem with 4v42 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
150 +
151 +4v42_1_BA_151-2903
152 +Could not find nucleotides of chain BA in annotation 4v42.json. Either there is a problem with 4v42 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
153 +
154 +2ob7_1_A_10-319
155 +DSSR warning 2ob7.json: no nucleotides found. Ignoring 2ob7_1_A_10-319.
156 +
157 +1x1l_1_A_1-130
158 +DSSR warning 1x1l.json: no nucleotides found. Ignoring 1x1l_1_A_1-130.
159 +
160 +1zc8_1_Z_1-130
161 +DSSR warning 1zc8.json: no nucleotides found. Ignoring 1zc8_1_Z_1-130.
162 +
163 +1zc8_1_Z_1-91
164 +DSSR warning 1zc8.json: no nucleotides found. Ignoring 1zc8_1_Z_1-91.
165 +
166 +2ob7_1_D_1-130
167 +DSSR warning 2ob7.json: no nucleotides found. Ignoring 2ob7_1_D_1-130.
168 +
169 +1r2x_1_C_1-58
170 +DSSR warning 1r2x.json: no nucleotides found. Ignoring 1r2x_1_C_1-58.
171 +
172 +1r2w_1_C_1-58
173 +DSSR warning 1r2w.json: no nucleotides found. Ignoring 1r2w_1_C_1-58.
174 +
175 +1eg0_1_L_1-56
176 +DSSR warning 1eg0.json: no nucleotides found. Ignoring 1eg0_1_L_1-56.
177 +
178 +1eg0_1_L_1-57
179 +DSSR warning 1eg0.json: no nucleotides found. Ignoring 1eg0_1_L_1-57.
180 +
181 +6rxu_1_C2_588-2386
182 +Could not find nucleotides of chain C2 in annotation 6rxu.json. Either there is a problem with 6rxu mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
183 +
184 +6rxu_1_C2_588-2383
185 +Could not find nucleotides of chain C2 in annotation 6rxu.json. Either there is a problem with 6rxu mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
186 +
187 +6rxu_1_C2_583-2388
188 +Could not find nucleotides of chain C2 in annotation 6rxu.json. Either there is a problem with 6rxu mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
189 +
190 +5oql_1_2_588-2386
191 +Could not find nucleotides of chain 2 in annotation 5oql.json. Either there is a problem with 5oql mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
192 +
193 +5oql_1_2_588-2383
194 +Could not find nucleotides of chain 2 in annotation 5oql.json. Either there is a problem with 5oql mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
195 +
196 +5oql_1_2_583-2388
197 +Could not find nucleotides of chain 2 in annotation 5oql.json. Either there is a problem with 5oql mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
198 +
199 +6rxv_1_C2_588-2386
200 +Could not find nucleotides of chain C2 in annotation 6rxv.json. Either there is a problem with 6rxv mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
201 +
202 +6rxv_1_C2_588-2383
203 +Could not find nucleotides of chain C2 in annotation 6rxv.json. Either there is a problem with 6rxv mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
204 +
205 +6rxv_1_C2_583-2388
206 +Could not find nucleotides of chain C2 in annotation 6rxv.json. Either there is a problem with 6rxv mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
207 +
208 +6rxz_1_C2_588-2386
209 +Could not find nucleotides of chain C2 in annotation 6rxz.json. Either there is a problem with 6rxz mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
210 +
211 +6rxz_1_C2_588-2383
212 +Could not find nucleotides of chain C2 in annotation 6rxz.json. Either there is a problem with 6rxz mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
213 +
214 +6rxz_1_C2_583-2388
215 +Could not find nucleotides of chain C2 in annotation 6rxz.json. Either there is a problem with 6rxz mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
216 +
217 +6rxy_1_C2_588-2386
218 +Could not find nucleotides of chain C2 in annotation 6rxy.json. Either there is a problem with 6rxy mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
219 +
220 +6rxy_1_C2_588-2383
221 +Could not find nucleotides of chain C2 in annotation 6rxy.json. Either there is a problem with 6rxy mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
222 +
223 +6rxy_1_C2_583-2388
224 +Could not find nucleotides of chain C2 in annotation 6rxy.json. Either there is a problem with 6rxy mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
225 +
226 +6rxt_1_C2_588-2386
227 +Could not find nucleotides of chain C2 in annotation 6rxt.json. Either there is a problem with 6rxt mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
228 +
229 +6rxt_1_C2_588-2383
230 +Could not find nucleotides of chain C2 in annotation 6rxt.json. Either there is a problem with 6rxt mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
231 +
232 +6rxt_1_C2_583-2388
233 +Could not find nucleotides of chain C2 in annotation 6rxt.json. Either there is a problem with 6rxt mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
234 +
235 +4v48_1_BA_1-91
236 +DSSR warning 4v48.json: no nucleotides found. Ignoring 4v48_1_BA_1-91.
237 +
238 +4v48_1_BA_6-1541
239 +DSSR warning 4v48.json: no nucleotides found. Ignoring 4v48_1_BA_6-1541.
240 +
241 +4v48_1_BA_6-1538
242 +DSSR warning 4v48.json: no nucleotides found. Ignoring 4v48_1_BA_6-1538.
243 +
244 +4v48_1_BA_1-1543
245 +DSSR warning 4v48.json: no nucleotides found. Ignoring 4v48_1_BA_1-1543.
246 +
247 +4v47_1_BA_1-91
248 +DSSR warning 4v47.json: no nucleotides found. Ignoring 4v47_1_BA_1-91.
249 +
250 +4v47_1_BA_6-1540
251 +DSSR warning 4v47.json: no nucleotides found. Ignoring 4v47_1_BA_6-1540.
252 +
253 +4v47_1_BA_6-1537
254 +DSSR warning 4v47.json: no nucleotides found. Ignoring 4v47_1_BA_6-1537.
255 +
256 +4v47_1_BA_1-1542
257 +DSSR warning 4v47.json: no nucleotides found. Ignoring 4v47_1_BA_1-1542.
258 +
259 +2rdo_1_B_6-1460
260 +DSSR warning 2rdo.json: no nucleotides found. Ignoring 2rdo_1_B_6-1460.
261 +
262 +2rdo_1_B_6-1522
263 +DSSR warning 2rdo.json: no nucleotides found. Ignoring 2rdo_1_B_6-1522.
264 +
265 +2rdo_1_B_1-2903
266 +DSSR warning 2rdo.json: no nucleotides found. Ignoring 2rdo_1_B_1-2903.
267 +
268 +2rdo_1_B_6-1457
269 +DSSR warning 2rdo.json: no nucleotides found. Ignoring 2rdo_1_B_6-1457.
270 +
271 +2rdo_1_B_1-2904
272 +DSSR warning 2rdo.json: no nucleotides found. Ignoring 2rdo_1_B_1-2904.
273 +
274 +2rdo_1_B_1-1528
275 +DSSR warning 2rdo.json: no nucleotides found. Ignoring 2rdo_1_B_1-1528.
276 +
277 +2rdo_1_B_160-2893
278 +DSSR warning 2rdo.json: no nucleotides found. Ignoring 2rdo_1_B_160-2893.
279 +
280 +4v48_1_A0_6-1460
281 +DSSR warning 4v48.json: no nucleotides found. Ignoring 4v48_1_A0_6-1460.
282 +
283 +4v48_1_A0_6-1522
284 +DSSR warning 4v48.json: no nucleotides found. Ignoring 4v48_1_A0_6-1522.
285 +
286 +4v48_1_A0_1-2903
287 +DSSR warning 4v48.json: no nucleotides found. Ignoring 4v48_1_A0_1-2903.
288 +
289 +4v48_1_A0_6-1457
290 +DSSR warning 4v48.json: no nucleotides found. Ignoring 4v48_1_A0_6-1457.
291 +
292 +4v48_1_A0_1-2904
293 +DSSR warning 4v48.json: no nucleotides found. Ignoring 4v48_1_A0_1-2904.
294 +
295 +4v48_1_A0_1-1528
296 +DSSR warning 4v48.json: no nucleotides found. Ignoring 4v48_1_A0_1-1528.
297 +
298 +4v48_1_A0_160-2893
299 +DSSR warning 4v48.json: no nucleotides found. Ignoring 4v48_1_A0_160-2893.
300 +
301 +4v47_1_A0_6-1460
302 +DSSR warning 4v47.json: no nucleotides found. Ignoring 4v47_1_A0_6-1460.
303 +
304 +4v47_1_A0_6-1522
305 +DSSR warning 4v47.json: no nucleotides found. Ignoring 4v47_1_A0_6-1522.
306 +
307 +4v47_1_A0_1-2903
308 +DSSR warning 4v47.json: no nucleotides found. Ignoring 4v47_1_A0_1-2903.
309 +
310 +4v47_1_A0_6-1457
311 +DSSR warning 4v47.json: no nucleotides found. Ignoring 4v47_1_A0_6-1457.
312 +
313 +4v47_1_A0_1-2904
314 +DSSR warning 4v47.json: no nucleotides found. Ignoring 4v47_1_A0_1-2904.
315 +
316 +4v47_1_A0_1-1528
317 +DSSR warning 4v47.json: no nucleotides found. Ignoring 4v47_1_A0_1-1528.
318 +
319 +4v47_1_A0_160-2893
320 +DSSR warning 4v47.json: no nucleotides found. Ignoring 4v47_1_A0_160-2893.
321 +
322 +1zc8_1_A_1-59
323 +DSSR warning 1zc8.json: no nucleotides found. Ignoring 1zc8_1_A_1-59.
324 +
325 +1mvr_1_D_1-59
326 +DSSR warning 1mvr.json: no nucleotides found. Ignoring 1mvr_1_D_1-59.
327 +
328 +4c9d_1_D_29-1
329 +Mapping is reversed, this case is not supported (yet).
330 +
331 +4c9d_1_C_29-1
332 +Mapping is reversed, this case is not supported (yet).
333 +
334 +4adx_1_9_1-121
335 +DSSR warning 4adx.json: no nucleotides found. Ignoring 4adx_1_9_1-121.
336 +
337 +1zn1_1_B_1-59
338 +DSSR warning 1zn1.json: no nucleotides found. Ignoring 1zn1_1_B_1-59.
339 +
340 +1emi_1_B_1-108
341 +DSSR warning 1emi.json: no nucleotides found. Ignoring 1emi_1_B_1-108.
342 +
343 +3iy9_1_A_498-1027
344 +DSSR warning 3iy9.json: no nucleotides found. Ignoring 3iy9_1_A_498-1027.
345 +
346 +1jgq_1_A_20-55
347 +Could not find nucleotides of chain A in annotation 1jgq.json. Either there is a problem with 1jgq mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
348 +
349 +1jgq_1_A_7-1518
350 +Could not find nucleotides of chain A in annotation 1jgq.json. Either there is a problem with 1jgq mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
351 +
352 +1jgq_1_A_7-1515
353 +Could not find nucleotides of chain A in annotation 1jgq.json. Either there is a problem with 1jgq mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
354 +
355 +1jgq_1_A_2-1520
356 +Could not find nucleotides of chain A in annotation 1jgq.json. Either there is a problem with 1jgq mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
357 +
358 +4v42_1_AA_20-55
359 +Could not find nucleotides of chain AA in annotation 4v42.json. Either there is a problem with 4v42 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
360 +
361 +4v42_1_AA_7-1518
362 +Could not find nucleotides of chain AA in annotation 4v42.json. Either there is a problem with 4v42 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
363 +
364 +4v42_1_AA_7-1515
365 +Could not find nucleotides of chain AA in annotation 4v42.json. Either there is a problem with 4v42 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
366 +
367 +4v42_1_AA_2-1520
368 +Could not find nucleotides of chain AA in annotation 4v42.json. Either there is a problem with 4v42 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
369 +
370 +1jgo_1_A_20-55
371 +Could not find nucleotides of chain A in annotation 1jgo.json. Either there is a problem with 1jgo mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
372 +
373 +1jgo_1_A_7-1518
374 +Could not find nucleotides of chain A in annotation 1jgo.json. Either there is a problem with 1jgo mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
375 +
376 +1jgo_1_A_7-1515
377 +Could not find nucleotides of chain A in annotation 1jgo.json. Either there is a problem with 1jgo mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
378 +
379 +1jgo_1_A_2-1520
380 +Could not find nucleotides of chain A in annotation 1jgo.json. Either there is a problem with 1jgo mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
381 +
382 +1jgp_1_A_20-55
383 +Could not find nucleotides of chain A in annotation 1jgp.json. Either there is a problem with 1jgp mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
384 +
385 +1jgp_1_A_7-1518
386 +Could not find nucleotides of chain A in annotation 1jgp.json. Either there is a problem with 1jgp mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
387 +
388 +1jgp_1_A_7-1515
389 +Could not find nucleotides of chain A in annotation 1jgp.json. Either there is a problem with 1jgp mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
390 +
391 +1jgp_1_A_2-1520
392 +Could not find nucleotides of chain A in annotation 1jgp.json. Either there is a problem with 1jgp mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
393 +
394 +3ep2_1_B_1-50
395 +DSSR warning 3ep2.json: no nucleotides found. Ignoring 3ep2_1_B_1-50.
396 +
397 +3eq3_1_B_1-50
398 +DSSR warning 3eq3.json: no nucleotides found. Ignoring 3eq3_1_B_1-50.
399 +
400 +3eq4_1_B_1-50
401 +DSSR warning 3eq4.json: no nucleotides found. Ignoring 3eq4_1_B_1-50.
402 +
403 +3pgw_1_R_1-164
404 +DSSR warning 3pgw.json: no nucleotides found. Ignoring 3pgw_1_R_1-164.
405 +
406 +3pgw_1_N_1-164
407 +DSSR warning 3pgw.json: no nucleotides found. Ignoring 3pgw_1_N_1-164.
408 +
409 +3cw1_1_x_1-138
410 +DSSR warning 3cw1.json: no nucleotides found. Ignoring 3cw1_1_x_1-138.
411 +
412 +3cw1_1_w_1-138
413 +DSSR warning 3cw1.json: no nucleotides found. Ignoring 3cw1_1_w_1-138.
414 +
415 +3cw1_1_V_1-138
416 +DSSR warning 3cw1.json: no nucleotides found. Ignoring 3cw1_1_V_1-138.
417 +
418 +3cw1_1_v_1-138
419 +DSSR warning 3cw1.json: no nucleotides found. Ignoring 3cw1_1_v_1-138.
420 +
421 +2iy3_1_B_9-105
422 +DSSR warning 2iy3.json: no nucleotides found. Ignoring 2iy3_1_B_9-105.
423 +
424 +3jcr_1_N_1-106
425 +DSSR warning 3jcr.json: no nucleotides found. Ignoring 3jcr_1_N_1-106.
426 +
427 +3jcr_1_N_1-188
428 +DSSR warning 3jcr.json: no nucleotides found. Ignoring 3jcr_1_N_1-188.
429 +
430 +2vaz_1_A_64-177
431 +DSSR warning 2vaz.json: no nucleotides found. Ignoring 2vaz_1_A_64-177.
432 +
433 +2ftc_1_R_81-1466
434 +DSSR warning 2ftc.json: no nucleotides found. Ignoring 2ftc_1_R_81-1466.
435 +
436 +2ftc_1_R_1-1568
437 +DSSR warning 2ftc.json: no nucleotides found. Ignoring 2ftc_1_R_1-1568.
438 +
439 +2ftc_1_R_792-1568
440 +DSSR warning 2ftc.json: no nucleotides found. Ignoring 2ftc_1_R_792-1568.
441 +
442 +3jcr_1_M_1-141
443 +DSSR warning 3jcr.json: no nucleotides found. Ignoring 3jcr_1_M_1-141.
444 +
445 +3jcr_1_M_1-107
446 +DSSR warning 3jcr.json: no nucleotides found. Ignoring 3jcr_1_M_1-107.
447 +
448 +3jcr_1_M_1-188
449 +DSSR warning 3jcr.json: no nucleotides found. Ignoring 3jcr_1_M_1-188.
450 +
451 +4v5z_1_B0_1-2840
452 +DSSR warning 4v5z.json: no nucleotides found. Ignoring 4v5z_1_B0_1-2840.
453 +
454 +4v5z_1_B0_1-2899
455 +DSSR warning 4v5z.json: no nucleotides found. Ignoring 4v5z_1_B0_1-2899.
456 +
457 +4v5z_1_B0_1-2902
458 +DSSR warning 4v5z.json: no nucleotides found. Ignoring 4v5z_1_B0_1-2902.
459 +
460 +5g2x_1_A_595-692
461 +Sequence is too short. (< 5 resolved nts)
462 +
463 +3iy8_1_A_1-540
464 +DSSR warning 3iy8.json: no nucleotides found. Ignoring 3iy8_1_A_1-540.
465 +
466 +4v5z_1_BY_2-113
467 +DSSR warning 4v5z.json: no nucleotides found. Ignoring 4v5z_1_BY_2-113.
468 +
469 +4v5z_1_BZ_1-70
470 +DSSR warning 4v5z.json: no nucleotides found. Ignoring 4v5z_1_BZ_1-70.
471 +
472 +1mvr_1_B_1-96
473 +DSSR warning 1mvr.json: no nucleotides found. Ignoring 1mvr_1_B_1-96.
474 +
475 +4adx_1_0_1-2923
476 +DSSR warning 4adx.json: no nucleotides found. Ignoring 4adx_1_0_1-2923.
477 +
478 +4adx_1_0_132-2915
479 +DSSR warning 4adx.json: no nucleotides found. Ignoring 4adx_1_0_132-2915.
480 +
481 +3eq4_1_Y_1-69
482 +DSSR warning 3eq4.json: no nucleotides found. Ignoring 3eq4_1_Y_1-69.
483 +
484 +4v5z_1_AA_1-1562
485 +DSSR warning 4v5z.json: no nucleotides found. Ignoring 4v5z_1_AA_1-1562.
486 +
487 +4v5z_1_AA_1-1563
488 +DSSR warning 4v5z.json: no nucleotides found. Ignoring 4v5z_1_AA_1-1563.
489 +
490 +6lqm_1_8_1267-4755
491 +Could not find nucleotides of chain 8 in annotation 6lqm.json. Either there is a problem with 6lqm mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
492 +
493 +6lu8_1_8_1267-4755
494 +Could not find nucleotides of chain 8 in annotation 6lu8.json. Either there is a problem with 6lu8 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
495 +
496 +6lsr_1_8_1267-4755
497 +Could not find nucleotides of chain 8 in annotation 6lsr.json. Either there is a problem with 6lsr mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
498 +
499 +6lss_1_8_1267-4755
500 +Could not find nucleotides of chain 8 in annotation 6lss.json. Either there is a problem with 6lss mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
501 +
...@@ -5,7 +5,7 @@ ...@@ -5,7 +5,7 @@
5 # in the database. 5 # in the database.
6 # This should be run from the folder where the file is (to access the database with path "results/RNANet.db") 6 # This should be run from the folder where the file is (to access the database with path "results/RNANet.db")
7 7
8 -import os, pickle, sqlite3, sys 8 +import os, pickle, sqlite3, shlex, subprocess, sys
9 import numpy as np 9 import numpy as np
10 import pandas as pd 10 import pandas as pd
11 import threading as th 11 import threading as th
...@@ -16,14 +16,13 @@ import matplotlib.patches as mpatches ...@@ -16,14 +16,13 @@ import matplotlib.patches as mpatches
16 import scipy.cluster.hierarchy as sch 16 import scipy.cluster.hierarchy as sch
17 from scipy.spatial.distance import squareform 17 from scipy.spatial.distance import squareform
18 from mpl_toolkits.mplot3d import axes3d 18 from mpl_toolkits.mplot3d import axes3d
19 -from Bio.Phylo.TreeConstruction import DistanceCalculator
20 from Bio import AlignIO, SeqIO 19 from Bio import AlignIO, SeqIO
21 from functools import partial 20 from functools import partial
22 -from multiprocessing import Pool 21 +from multiprocessing import Pool, Manager
23 from os import path 22 from os import path
24 from tqdm import tqdm 23 from tqdm import tqdm
25 from collections import Counter 24 from collections import Counter
26 -from RNAnet import read_cpu_number, sql_ask_database, sql_execute, warn, notify, init_worker 25 +from RNAnet import Job, read_cpu_number, sql_ask_database, sql_execute, warn, notify, init_worker
27 26
28 # This sets the paths 27 # This sets the paths
29 if len(sys.argv) > 1: 28 if len(sys.argv) > 1:
...@@ -37,7 +36,7 @@ else: ...@@ -37,7 +36,7 @@ else:
37 LSU_set = ("RF00002", "RF02540", "RF02541", "RF02543", "RF02546") # From Rfam CLAN 00112 36 LSU_set = ("RF00002", "RF02540", "RF02541", "RF02543", "RF02546") # From Rfam CLAN 00112
38 SSU_set = ("RF00177", "RF02542", "RF02545", "RF01959", "RF01960") # From Rfam CLAN 00111 37 SSU_set = ("RF00177", "RF02542", "RF02545", "RF01959", "RF01960") # From Rfam CLAN 00111
39 38
40 -def reproduce_wadley_results(show=False, carbon=4, sd_range=(1,4)): 39 +def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4)):
41 """ 40 """
42 Plot the joint distribution of pseudotorsion angles, in a Ramachandran-style graph. 41 Plot the joint distribution of pseudotorsion angles, in a Ramachandran-style graph.
43 See Wadley & Pyle (2007) 42 See Wadley & Pyle (2007)
...@@ -68,6 +67,12 @@ def reproduce_wadley_results(show=False, carbon=4, sd_range=(1,4)): ...@@ -68,6 +67,12 @@ def reproduce_wadley_results(show=False, carbon=4, sd_range=(1,4)):
68 67
69 68
70 if not path.isfile(f"data/wadley_kernel_{angle}.npz"): 69 if not path.isfile(f"data/wadley_kernel_{angle}.npz"):
70 +
71 + # Get a worker number to position the progress bar
72 + global idxQueue
73 + thr_idx = idxQueue.get()
74 + pbar = tqdm(total=2, desc=f"Worker {thr_idx+1}: eta/theta C{carbon} kernels", position=thr_idx+1, leave=False)
75 +
71 # Extract the angle values of c2'-endo and c3'-endo nucleotides 76 # Extract the angle values of c2'-endo and c3'-endo nucleotides
72 with sqlite3.connect("results/RNANet.db") as conn: 77 with sqlite3.connect("results/RNANet.db") as conn:
73 df = pd.read_sql(f"""SELECT {angle}, th{angle} FROM nucleotide WHERE puckering="C2'-endo" AND {angle} IS NOT NULL AND th{angle} IS NOT NULL;""", conn) 78 df = pd.read_sql(f"""SELECT {angle}, th{angle} FROM nucleotide WHERE puckering="C2'-endo" AND {angle} IS NOT NULL AND th{angle} IS NOT NULL;""", conn)
...@@ -89,13 +94,17 @@ def reproduce_wadley_results(show=False, carbon=4, sd_range=(1,4)): ...@@ -89,13 +94,17 @@ def reproduce_wadley_results(show=False, carbon=4, sd_range=(1,4)):
89 xx, yy = np.mgrid[0:2*np.pi:100j, 0:2*np.pi:100j] 94 xx, yy = np.mgrid[0:2*np.pi:100j, 0:2*np.pi:100j]
90 positions = np.vstack([xx.ravel(), yy.ravel()]) 95 positions = np.vstack([xx.ravel(), yy.ravel()])
91 f_c3 = np.reshape(kernel_c3(positions).T, xx.shape) 96 f_c3 = np.reshape(kernel_c3(positions).T, xx.shape)
97 + pbar.update(1)
92 f_c2 = np.reshape(kernel_c2(positions).T, xx.shape) 98 f_c2 = np.reshape(kernel_c2(positions).T, xx.shape)
99 + pbar.update(1)
93 100
94 # Save the data to an archive for later use without the need to recompute 101 # Save the data to an archive for later use without the need to recompute
95 np.savez(f"data/wadley_kernel_{angle}.npz", 102 np.savez(f"data/wadley_kernel_{angle}.npz",
96 c3_endo_e=c3_endo_etas, c3_endo_t=c3_endo_thetas, 103 c3_endo_e=c3_endo_etas, c3_endo_t=c3_endo_thetas,
97 c2_endo_e=c2_endo_etas, c2_endo_t=c2_endo_thetas, 104 c2_endo_e=c2_endo_etas, c2_endo_t=c2_endo_thetas,
98 kernel_c3=f_c3, kernel_c2=f_c2) 105 kernel_c3=f_c3, kernel_c2=f_c2)
106 + pbar.close()
107 + idxQueue.put(thr_idx)
99 else: 108 else:
100 f = np.load(f"data/wadley_kernel_{angle}.npz") 109 f = np.load(f"data/wadley_kernel_{angle}.npz")
101 c2_endo_etas = f["c2_endo_e"] 110 c2_endo_etas = f["c2_endo_e"]
...@@ -106,7 +115,7 @@ def reproduce_wadley_results(show=False, carbon=4, sd_range=(1,4)): ...@@ -106,7 +115,7 @@ def reproduce_wadley_results(show=False, carbon=4, sd_range=(1,4)):
106 f_c2 = f["kernel_c2"] 115 f_c2 = f["kernel_c2"]
107 xx, yy = np.mgrid[0:2*np.pi:100j, 0:2*np.pi:100j] 116 xx, yy = np.mgrid[0:2*np.pi:100j, 0:2*np.pi:100j]
108 117
109 - notify(f"Kernel computed for {angle}/th{angle} (or loaded from file).") 118 + # notify(f"Kernel computed for {angle}/th{angle} (or loaded from file).")
110 119
111 # exact counts: 120 # exact counts:
112 hist_c2, xedges, yedges = np.histogram2d(c2_endo_etas, c2_endo_thetas, bins=int(2*np.pi/0.1), 121 hist_c2, xedges, yedges = np.histogram2d(c2_endo_etas, c2_endo_thetas, bins=int(2*np.pi/0.1),
...@@ -139,7 +148,7 @@ def reproduce_wadley_results(show=False, carbon=4, sd_range=(1,4)): ...@@ -139,7 +148,7 @@ def reproduce_wadley_results(show=False, carbon=4, sd_range=(1,4)):
139 fig.savefig(f"results/figures/wadley_plots/wadley_hist_{angle}_{l}.png") 148 fig.savefig(f"results/figures/wadley_plots/wadley_hist_{angle}_{l}.png")
140 if show: 149 if show:
141 fig.show() 150 fig.show()
142 - fig.close() 151 + plt.close()
143 152
144 # Smoothed joint distribution 153 # Smoothed joint distribution
145 fig = plt.figure() 154 fig = plt.figure()
...@@ -150,7 +159,7 @@ def reproduce_wadley_results(show=False, carbon=4, sd_range=(1,4)): ...@@ -150,7 +159,7 @@ def reproduce_wadley_results(show=False, carbon=4, sd_range=(1,4)):
150 fig.savefig(f"results/figures/wadley_plots/wadley_distrib_{angle}_{l}.png") 159 fig.savefig(f"results/figures/wadley_plots/wadley_distrib_{angle}_{l}.png")
151 if show: 160 if show:
152 fig.show() 161 fig.show()
153 - fig.close() 162 + plt.close()
154 163
155 # 2D Wadley plot 164 # 2D Wadley plot
156 fig = plt.figure(figsize=(5,5)) 165 fig = plt.figure(figsize=(5,5))
...@@ -163,7 +172,7 @@ def reproduce_wadley_results(show=False, carbon=4, sd_range=(1,4)): ...@@ -163,7 +172,7 @@ def reproduce_wadley_results(show=False, carbon=4, sd_range=(1,4)):
163 fig.savefig(f"results/figures/wadley_plots/wadley_{angle}_{l}.png") 172 fig.savefig(f"results/figures/wadley_plots/wadley_{angle}_{l}.png")
164 if show: 173 if show:
165 fig.show() 174 fig.show()
166 - fig.close() 175 + plt.close()
167 # print(f"[{worker_nbr}]\tComputed joint distribution of angles (C{carbon}) and saved the figures.") 176 # print(f"[{worker_nbr}]\tComputed joint distribution of angles (C{carbon}) and saved the figures.")
168 177
169 def stats_len(): 178 def stats_len():
...@@ -171,11 +180,15 @@ def stats_len(): ...@@ -171,11 +180,15 @@ def stats_len():
171 180
172 REQUIRES tables chain, nucleotide up to date. 181 REQUIRES tables chain, nucleotide up to date.
173 """ 182 """
183 +
184 + # Get a worker number to position the progress bar
185 + global idxQueue
186 + thr_idx = idxQueue.get()
174 187
175 cols = [] 188 cols = []
176 lengths = [] 189 lengths = []
177 - conn = sqlite3.connect("results/RNANet.db") 190 +
178 - for i,f in enumerate(fam_list): 191 + for i,f in enumerate(tqdm(fam_list, position=thr_idx+1, desc=f"Worker {thr_idx+1}: Average chain lengths", leave=False)):
179 192
180 # Define a color for that family in the plot 193 # Define a color for that family in the plot
181 if f in LSU_set: 194 if f in LSU_set:
...@@ -190,11 +203,11 @@ def stats_len(): ...@@ -190,11 +203,11 @@ def stats_len():
190 cols.append("grey") 203 cols.append("grey")
191 204
192 # Get the lengths of chains 205 # Get the lengths of chains
193 - l = [ x[0] for x in sql_ask_database(conn, f"SELECT COUNT(index_chain) FROM (SELECT chain_id FROM chain WHERE rfam_acc='{f}') NATURAL JOIN nucleotide GROUP BY chain_id;") ] 206 + with sqlite3.connect("results/RNANet.db") as conn:
207 + l = [ x[0] for x in sql_ask_database(conn, f"SELECT COUNT(index_chain) FROM (SELECT chain_id FROM chain WHERE rfam_acc='{f}') NATURAL JOIN nucleotide GROUP BY chain_id;", warn_every=0) ]
194 lengths.append(l) 208 lengths.append(l)
195 209
196 - notify(f"[{i+1}/{len(fam_list)}] Computed {f} chains lengths") 210 + # notify(f"[{i+1}/{len(fam_list)}] Computed {f} chains lengths")
197 - conn.close()
198 211
199 # Plot the figure 212 # Plot the figure
200 fig = plt.figure(figsize=(10,3)) 213 fig = plt.figure(figsize=(10,3))
...@@ -223,7 +236,8 @@ def stats_len(): ...@@ -223,7 +236,8 @@ def stats_len():
223 236
224 # Save the figure 237 # Save the figure
225 fig.savefig("results/figures/lengths.png") 238 fig.savefig("results/figures/lengths.png")
226 - notify("Computed sequence length statistics and saved the figure.") 239 + idxQueue.put(thr_idx) # replace the thread index in the queue
240 + # notify("Computed sequence length statistics and saved the figure.")
227 241
228 def format_percentage(tot, x): 242 def format_percentage(tot, x):
229 if not tot: 243 if not tot:
...@@ -242,40 +256,57 @@ def stats_freq(): ...@@ -242,40 +256,57 @@ def stats_freq():
242 256
243 Outputs results/frequencies.csv 257 Outputs results/frequencies.csv
244 REQUIRES tables chain, nucleotide up to date.""" 258 REQUIRES tables chain, nucleotide up to date."""
259 +
260 + # Get a worker number to position the progress bar
261 + global idxQueue
262 + thr_idx = idxQueue.get()
263 +
245 # Initialize a Counter object for each family 264 # Initialize a Counter object for each family
246 freqs = {} 265 freqs = {}
247 for f in fam_list: 266 for f in fam_list:
248 freqs[f] = Counter() 267 freqs[f] = Counter()
249 268
250 # List all nt_names happening within a RNA family and store the counts in the Counter 269 # List all nt_names happening within a RNA family and store the counts in the Counter
251 - conn = sqlite3.connect("results/RNANet.db") 270 + for i,f in enumerate(tqdm(fam_list, position=thr_idx+1, desc=f"Worker {thr_idx+1}: Base frequencies", leave=False)):
252 - for i,f in enumerate(fam_list): 271 + with sqlite3.connect("results/RNANet.db") as conn:
253 - counts = dict(sql_ask_database(conn, f"SELECT nt_name, COUNT(nt_name) FROM (SELECT chain_id from chain WHERE rfam_acc='{f}') NATURAL JOIN nucleotide GROUP BY nt_name;")) 272 + counts = dict(sql_ask_database(conn, f"SELECT nt_name, COUNT(nt_name) FROM (SELECT chain_id from chain WHERE rfam_acc='{f}') NATURAL JOIN nucleotide GROUP BY nt_name;", warn_every=0))
254 freqs[f].update(counts) 273 freqs[f].update(counts)
255 - notify(f"[{i+1}/{len(fam_list)}] Computed {f} nucleotide frequencies.") 274 + # notify(f"[{i+1}/{len(fam_list)}] Computed {f} nucleotide frequencies.")
256 - conn.close()
257 275
258 # Create a pandas DataFrame, and save it to CSV. 276 # Create a pandas DataFrame, and save it to CSV.
259 df = pd.DataFrame() 277 df = pd.DataFrame()
260 - for f in fam_list: 278 + for f in tqdm(fam_list, position=thr_idx+1, desc=f"Worker {thr_idx+1}: Base frequencies", leave=False):
261 tot = sum(freqs[f].values()) 279 tot = sum(freqs[f].values())
262 df = pd.concat([ df, pd.DataFrame([[ format_percentage(tot, x) for x in freqs[f].values() ]], columns=list(freqs[f]), index=[f]) ]) 280 df = pd.concat([ df, pd.DataFrame([[ format_percentage(tot, x) for x in freqs[f].values() ]], columns=list(freqs[f]), index=[f]) ])
263 df = df.fillna(0) 281 df = df.fillna(0)
264 df.to_csv("results/frequencies.csv") 282 df.to_csv("results/frequencies.csv")
265 - notify("Saved nucleotide frequencies to CSV file.") 283 + idxQueue.put(thr_idx) # replace the thread index in the queue
284 + # notify("Saved nucleotide frequencies to CSV file.")
266 285
267 def parallel_stats_pairs(f): 286 def parallel_stats_pairs(f):
268 """Counts occurrences of intra-chain base-pair types in one RNA family 287 """Counts occurrences of intra-chain base-pair types in one RNA family
269 288
270 REQUIRES tables chain, nucleotide up-to-date.""" 289 REQUIRES tables chain, nucleotide up-to-date."""
271 290
291 + if path.isfile("data/"+f+"_pairs.csv") and path.isfile("data/"+f+"_counts.csv"):
292 + return
293 +
294 + # Get a worker number to position the progress bar
295 + global idxQueue
296 + thr_idx = idxQueue.get()
297 +
272 chain_id_list = mappings_list[f] 298 chain_id_list = mappings_list[f]
273 data = [] 299 data = []
274 - for cid in chain_id_list: 300 + sqldata = []
301 + for cid in tqdm(chain_id_list, position=thr_idx+1, desc=f"Worker {thr_idx+1}: {f} basepair types", leave=False):
275 with sqlite3.connect("results/RNANet.db") as conn: 302 with sqlite3.connect("results/RNANet.db") as conn:
276 # Get comma separated lists of basepairs per nucleotide 303 # Get comma separated lists of basepairs per nucleotide
277 - interactions = pd.read_sql(f"SELECT nt_code as nt1, index_chain, paired, pair_type_LW FROM (SELECT chain_id FROM chain WHERE chain_id='{cid}') NATURAL JOIN nucleotide;", conn) 304 + interactions = pd.DataFrame(
278 - 305 + sql_ask_database(conn,
306 + f"SELECT nt_code as nt1, index_chain, paired, pair_type_LW FROM (SELECT chain_id FROM chain WHERE chain_id='{cid}') NATURAL JOIN nucleotide;",
307 + warn_every=0),
308 + columns = ["nt1", "index_chain", "paired", "pair_type_LW"]
309 + )
279 # expand the comma-separated lists in real lists 310 # expand the comma-separated lists in real lists
280 expanded_list = pd.concat([ pd.DataFrame({ 'nt1':[ row["nt1"] for x in row["paired"].split(',') ], 311 expanded_list = pd.concat([ pd.DataFrame({ 'nt1':[ row["nt1"] for x in row["paired"].split(',') ],
281 'index_chain':[ row['index_chain'] for x in row["paired"].split(',') ], 312 'index_chain':[ row['index_chain'] for x in row["paired"].split(',') ],
...@@ -317,27 +348,29 @@ def parallel_stats_pairs(f): ...@@ -317,27 +348,29 @@ def parallel_stats_pairs(f):
317 348
318 # Update the database 349 # Update the database
319 vlcnts = expanded_list.pair_type_LW.value_counts() 350 vlcnts = expanded_list.pair_type_LW.value_counts()
320 - sqldata = ( vlcnts.at["cWW"]/2 if "cWW" in vlcnts.index else 0, 351 + sqldata.append( ( vlcnts.at["cWW"]/2 if "cWW" in vlcnts.index else 0,
321 - vlcnts.at["cWH"] if "cWH" in vlcnts.index else 0, 352 + vlcnts.at["cWH"] if "cWH" in vlcnts.index else 0,
322 - vlcnts.at["cWS"] if "cWS" in vlcnts.index else 0, 353 + vlcnts.at["cWS"] if "cWS" in vlcnts.index else 0,
323 - vlcnts.at["cHH"]/2 if "cHH" in vlcnts.index else 0, 354 + vlcnts.at["cHH"]/2 if "cHH" in vlcnts.index else 0,
324 - vlcnts.at["cHS"] if "cHS" in vlcnts.index else 0, 355 + vlcnts.at["cHS"] if "cHS" in vlcnts.index else 0,
325 - vlcnts.at["cSS"]/2 if "cSS" in vlcnts.index else 0, 356 + vlcnts.at["cSS"]/2 if "cSS" in vlcnts.index else 0,
326 - vlcnts.at["tWW"]/2 if "tWW" in vlcnts.index else 0, 357 + vlcnts.at["tWW"]/2 if "tWW" in vlcnts.index else 0,
327 - vlcnts.at["tWH"] if "tWH" in vlcnts.index else 0, 358 + vlcnts.at["tWH"] if "tWH" in vlcnts.index else 0,
328 - vlcnts.at["tWS"] if "tWS" in vlcnts.index else 0, 359 + vlcnts.at["tWS"] if "tWS" in vlcnts.index else 0,
329 - vlcnts.at["tHH"]/2 if "tHH" in vlcnts.index else 0, 360 + vlcnts.at["tHH"]/2 if "tHH" in vlcnts.index else 0,
330 - vlcnts.at["tHS"] if "tHS" in vlcnts.index else 0, 361 + vlcnts.at["tHS"] if "tHS" in vlcnts.index else 0,
331 - vlcnts.at["tSS"]/2 if "tSS" in vlcnts.index else 0, 362 + vlcnts.at["tSS"]/2 if "tSS" in vlcnts.index else 0,
332 - int(sum(vlcnts.loc[[ str(x) for x in vlcnts.index if "." in str(x)]])/2), 363 + int(sum(vlcnts.loc[[ str(x) for x in vlcnts.index if "." in str(x)]])/2),
333 - cid) 364 + cid) )
334 - with sqlite3.connect("results/RNANet.db") as conn:
335 - sql_execute(conn, """UPDATE chain SET pair_count_cWW = ?, pair_count_cWH = ?, pair_count_cWS = ?, pair_count_cHH = ?,
336 - pair_count_cHS = ?, pair_count_cSS = ?, pair_count_tWW = ?, pair_count_tWH = ?, pair_count_tWS = ?,
337 - pair_count_tHH = ?, pair_count_tHS = ?, pair_count_tSS = ?, pair_count_other = ? WHERE chain_id = ?;""", data=sqldata)
338 365
339 data.append(expanded_list) 366 data.append(expanded_list)
340 367
368 + # Update the database
369 + with sqlite3.connect("results/RNANet.db", isolation_level=None) as conn:
370 + conn.execute('pragma journal_mode=wal') # Allow multiple other readers to ask things while we execute this writing query
371 + sql_execute(conn, """UPDATE chain SET pair_count_cWW = ?, pair_count_cWH = ?, pair_count_cWS = ?, pair_count_cHH = ?,
372 + pair_count_cHS = ?, pair_count_cSS = ?, pair_count_tWW = ?, pair_count_tWH = ?, pair_count_tWS = ?,
373 + pair_count_tHH = ?, pair_count_tHS = ?, pair_count_tSS = ?, pair_count_other = ? WHERE chain_id = ?;""", many=True, data=sqldata, warn_every=0)
341 374
342 # merge all the dataframes from all chains of the family 375 # merge all the dataframes from all chains of the family
343 expanded_list = pd.concat(data) 376 expanded_list = pd.concat(data)
...@@ -351,7 +384,106 @@ def parallel_stats_pairs(f): ...@@ -351,7 +384,106 @@ def parallel_stats_pairs(f):
351 384
352 # Create an output DataFrame 385 # Create an output DataFrame
353 f_df = pd.DataFrame([[ x for x in cnt.values() ]], columns=list(cnt), index=[f]) 386 f_df = pd.DataFrame([[ x for x in cnt.values() ]], columns=list(cnt), index=[f])
354 - return expanded_list, f_df 387 + f_df.to_csv(f"data/{f}_counts.csv")
388 + expanded_list.to_csv(f"data/{f}_pairs.csv")
389 +
390 + idxQueue.put(thr_idx) # replace the thread index in the queue
391 +
def to_dist_matrix(f):
    """Run esl-alipid on the 3D chains of family *f* and cache the score matrix.

    The realigned alignment ``<path_to_seq_data>/realigned/<f>++.afa`` is read,
    the ids containing ``'['`` are collected, and the last ``len(names)``
    records of the alignment are written to a temporary Stockholm file
    ``<f>_3d_only.stk``. ``esl-alipid`` is run on that file and, for every
    output line, the third whitespace-separated field is stored at
    ``[index of first name, index of second name]`` of a square matrix, which
    is finally saved to ``data/<f>.npy``.

    Nothing is recomputed when ``data/<f>.npy`` already exists.

    Args:
        f: family identifier, used to build every file name.

    Returns:
        0, whether the matrix was computed or already present on disk.

    Raises:
        KeyError: if esl-alipid reports a sequence name that is not one of
            the collected ids.
    """
    cache_file = "data/" + f + ".npy"
    if path.isfile(cache_file):
        return 0

    # Get a worker number to position the progress bar
    global idxQueue
    thr_idx = idxQueue.get()

    stk_file = f + "_3d_only.stk"
    # Argument list instead of a shell-like string: no quoting issue with f
    command = ["esl-alipid", "--rna", "--noheader", "--informat", "stockholm", stk_file]
    pbar = None
    try:
        # Prepare the input file of esl-alipid
        with open(path_to_seq_data + f"/realigned/{f}++.afa") as al_file:
            al = AlignIO.read(al_file, "fasta")
        names = [x.id for x in al if '[' in x.id]
        # NOTE(review): presumably the records with '[' in their id are the
        # last ones of the alignment; with no such record, al[-0:] keeps the
        # whole alignment -- confirm that case cannot happen.
        al = al[-len(names):]
        with open(stk_file, "w") as only_3d:
            only_3d.write(al.format("stockholm"))
        del al

        # Name -> row/column index, keeping the first occurrence of a name
        # (same result as names.index(), without the linear scan per pair)
        first_index = {}
        for position, name in enumerate(names):
            first_index.setdefault(name, position)

        n_seqs = len(names)
        id_matrix = np.zeros((n_seqs, n_seqs))
        pbar = tqdm(total=n_seqs * (n_seqs - 1) // 2, position=thr_idx+1,
                    desc=f"Worker {thr_idx+1}: {f} idty matrix", leave=False)

        # Read the output until EOF: polling the process state instead would
        # drop the lines still buffered in the pipe when the process exits.
        with subprocess.Popen(command, stdout=subprocess.PIPE) as process:
            for raw_line in process.stdout:
                fields = raw_line.split()
                if len(fields) < 3:
                    continue  # blank or truncated line
                id1 = first_index[fields[0].decode('utf-8')]
                id2 = first_index[fields[1].decode('utf-8')]
                # NOTE(review): the third column is presumably a percent
                # identity, while seq_idty() reads this file as a distance
                # matrix in [0, 1] -- confirm the expected unit.
                id_matrix[id1, id2] = float(fields[2].decode('utf-8'))
                pbar.update(1)

        np.save(cache_file, id_matrix)
    finally:
        if pbar is not None:
            pbar.close()
        try:
            os.remove(stk_file)
        except FileNotFoundError:
            pass
        idxQueue.put(thr_idx)  # replace the thread index in the queue, even on error
    return 0
437 +
def seq_idty():
    """Store the mean identity of each RNA family and plot the cached matrices.

    For every family of ``famlist``, the matrix saved in ``data/<family>.npy``
    is loaded when that file exists. The ``idty_percent`` column of table
    ``family`` is then updated with ``1 - average(D + D.T)`` rounded to two
    decimals, and the matrices are drawn on a 4x17 grid of axes saved to
    ``results/figures/distances.png``.

    REQUIRES temporary results files in data/*.npy
    REQUIRES tables chain, family up to date.
    """

    # Load the cached matrices; an empty list marks a family without a file
    fam_arrays = []
    for f in famlist:
        matrix_file = "data/" + f + ".npy"
        fam_arrays.append(np.load(matrix_file) if path.isfile(matrix_file) else [])

    # Update database with identity percentages
    conn = sqlite3.connect("results/RNANet.db")
    try:
        for f, D in zip(famlist, fam_arrays):
            if not len(D):
                continue
            # Get symmetric matrix instead of lower triangle + convert from
            # distance matrix to identity matrix.
            # NOTE(review): this assumes the stored values are distances in
            # [0, 1]; to_dist_matrix() stores the raw third column of
            # esl-alipid -- confirm both agree on the unit.
            a = 1.0 - np.average(D + D.T)
            conn.execute("UPDATE family SET idty_percent = ? WHERE rfam_acc = ?;",
                         (round(float(a), 2), f))
        conn.commit()
    finally:
        conn.close()  # do not leak the connection if an update fails

    # Plots plots plots
    # NOTE(review): the grid has 4*17 = 68 axes; zip() silently ignores the
    # families beyond that number.
    fig, axs = plt.subplots(4, 17, figsize=(17, 5.75))
    axs = axs.ravel()
    for axi in axs:
        axi.set_axis_off()
    im = None  # last image drawn, needed to build the colorbar
    for f, D, ax in zip(famlist, fam_arrays, axs):
        if not len(D):
            continue
        if D.shape[0] > 2:  # Cluster only if there is more than 2 sequences to organize
            D = D + D.T     # Copy the lower triangle to upper, to get a symmetrical matrix
            condensedD = squareform(D)

            # Compute basic dendrogram by Ward's method
            Y = sch.linkage(condensedD, method='ward')
            Z = sch.dendrogram(Y, orientation='left', no_plot=True)

            # Reorganize rows and cols
            idx1 = Z['leaves']
            D = D[idx1, :]
            D = D[:, idx1[::-1]]
        im = ax.matshow(1.0 - D, vmin=0, vmax=1, origin='lower')  # convert to identity matrix 1 - D from distance matrix D
        ax.set_title(f + "\n(" + str(len(mappings_list[f])) + " chains)", fontsize=10)
    fig.tight_layout()
    fig.subplots_adjust(wspace=0.1, hspace=0.3)
    if im is not None:
        # Without any plotted matrix there is nothing to attach a colorbar to
        fig.colorbar(im, ax=axs[-1], shrink=0.8)
    fig.savefig("results/figures/distances.png")
    plt.close(fig)
    notify("Computed all identity matrices and saved the figure.")
355 487
356 def stats_pairs(): 488 def stats_pairs():
357 """Counts occurrences of intra-chain base-pair types in RNA families 489 """Counts occurrences of intra-chain base-pair types in RNA families
...@@ -363,26 +495,15 @@ def stats_pairs(): ...@@ -363,26 +495,15 @@ def stats_pairs():
363 return family_data.apply(partial(format_percentage, sum(family_data))) 495 return family_data.apply(partial(format_percentage, sum(family_data)))
364 496
365 if not path.isfile("data/pair_counts.csv"): 497 if not path.isfile("data/pair_counts.csv"):
366 - p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=read_cpu_number(), maxtasksperchild=5) 498 + results = []
367 - try: 499 + allpairs = []
368 - fam_pbar = tqdm(total=len(fam_list), desc="Pair-types in families", position=0, leave=True) 500 + for f in fam_list:
369 - results = [] 501 + newpairs = pd.read_csv(f"data/{f}_pairs.csv", index_col=0)
370 - allpairs = [] 502 + fam_df = pd.read_csv(f"data/{f}_counts.csv", index_col=0)
371 - for _, newp_famdf in enumerate(p.imap_unordered(parallel_stats_pairs, fam_list)): 503 + results.append(fam_df)
372 - newpairs, fam_df = newp_famdf 504 + allpairs.append(newpairs)
373 - fam_pbar.update(1) 505 + subprocess.run(["rm", "-f", f"data/{f}_pairs.csv"])
374 - results.append(fam_df) 506 + subprocess.run(["rm", "-f", f"data/{f}_counts.csv"])
375 - allpairs.append(newpairs)
376 - fam_pbar.close()
377 - p.close()
378 - p.join()
379 - except KeyboardInterrupt:
380 - warn("KeyboardInterrupt, terminating workers.", error=True)
381 - fam_pbar.close()
382 - p.terminate()
383 - p.join()
384 - exit(1)
385 -
386 all_pairs = pd.concat(allpairs) 507 all_pairs = pd.concat(allpairs)
387 df = pd.concat(results).fillna(0) 508 df = pd.concat(results).fillna(0)
388 df.to_csv("data/pair_counts.csv") 509 df.to_csv("data/pair_counts.csv")
...@@ -431,92 +552,12 @@ def stats_pairs(): ...@@ -431,92 +552,12 @@ def stats_pairs():
431 552
432 notify("Computed nucleotide statistics and saved CSV and PNG file.") 553 notify("Computed nucleotide statistics and saved CSV and PNG file.")
433 554
434 -def to_dist_matrix(f):
435 - if path.isfile("data/"+f+".npy"):
436 - notify(f"Computed {f} distance matrix", "loaded from file")
437 - return 0
438 -
439 - notify(f"Computing {f} distance matrix from alignment...")
440 - dm = DistanceCalculator('identity')
441 - with open(path_to_seq_data+"/realigned/"+f+"++.afa") as al_file:
442 - al = AlignIO.read(al_file, "fasta")[-len(mappings_list[f]):]
443 - idty = dm.get_distance(al).matrix # list of lists
444 - del al
445 - l = len(idty)
446 - np.save("data/"+f+".npy", np.array([ idty[i] + [0]*(l-1-i) if i<l-1 else idty[i] for i in range(l) ], dtype=object))
447 - del idty
448 - notify(f"Computed {f} distance matrix")
449 - return 0
450 -
451 -def seq_idty():
452 - """Computes identity matrices for each of the RNA families.
453 -
454 - Creates temporary results files in data/*.npy
455 - REQUIRES tables chain, family un to date."""
456 -
457 - # List the families for which we will compute sequence identity matrices
458 - conn = sqlite3.connect("results/RNANet.db")
459 - famlist = [ x[0] for x in sql_ask_database(conn, "SELECT rfam_acc from (SELECT rfam_acc, COUNT(chain_id) as n_chains FROM family NATURAL JOIN chain GROUP BY rfam_acc) WHERE n_chains > 1 ORDER BY rfam_acc ASC;") ]
460 - ignored = [ x[0] for x in sql_ask_database(conn, "SELECT rfam_acc from (SELECT rfam_acc, COUNT(chain_id) as n_chains FROM family NATURAL JOIN chain GROUP BY rfam_acc) WHERE n_chains < 2 ORDER BY rfam_acc ASC;") ]
461 - if len(ignored):
462 - print(f"Idty matrices: Ignoring {len(ignored)} families with only one chain:", " ".join(ignored)+'\n')
463 -
464 - # compute distance matrices (or ignore if data/RF0****.npy exists)
465 - p = Pool(processes=8)
466 - p.map(to_dist_matrix, famlist)
467 - p.close()
468 - p.join()
469 -
470 - # load them
471 - fam_arrays = []
472 - for f in famlist:
473 - if path.isfile("data/"+f+".npy"):
474 - fam_arrays.append(np.load("data/"+f+".npy"))
475 - else:
476 - fam_arrays.append([])
477 -
478 - # Update database with identity percentages
479 - conn = sqlite3.connect("results/RNANet.db")
480 - for f, D in zip(famlist, fam_arrays):
481 - if not len(D): continue
482 - a = 1.0 - np.average(D + D.T) # Get symmetric matrix instead of lower triangle + convert from distance matrix to identity matrix
483 - conn.execute(f"UPDATE family SET idty_percent = {round(float(a),2)} WHERE rfam_acc = '{f}';")
484 - conn.commit()
485 - conn.close()
486 -
487 - # Plots plots plots
488 - fig, axs = plt.subplots(4,17, figsize=(17,5.75))
489 - axs = axs.ravel()
490 - [axi.set_axis_off() for axi in axs]
491 - im = "" # Just to declare the variable, it will be set in the loop
492 - for f, D, ax in zip(famlist, fam_arrays, axs):
493 - if not len(D): continue
494 - if D.shape[0] > 2: # Cluster only if there is more than 2 sequences to organize
495 - D = D + D.T # Copy the lower triangle to upper, to get a symetrical matrix
496 - condensedD = squareform(D)
497 -
498 - # Compute basic dendrogram by Ward's method
499 - Y = sch.linkage(condensedD, method='ward')
500 - Z = sch.dendrogram(Y, orientation='left', no_plot=True)
501 -
502 - # Reorganize rows and cols
503 - idx1 = Z['leaves']
504 - D = D[idx1,:]
505 - D = D[:,idx1[::-1]]
506 - im = ax.matshow(1.0 - D, vmin=0, vmax=1, origin='lower') # convert to identity matrix 1 - D from distance matrix D
507 - ax.set_title(f + "\n(" + str(len(mappings_list[f]))+ " chains)", fontsize=10)
508 - fig.tight_layout()
509 - fig.subplots_adjust(wspace=0.1, hspace=0.3)
510 - fig.colorbar(im, ax=axs[-1], shrink=0.8)
511 - fig.savefig(f"results/figures/distances.png")
512 - notify("Computed all identity matrices and saved the figure.")
513 -
514 def per_chain_stats(): 555 def per_chain_stats():
515 """Computes per-chain frequencies and base-pair type counts. 556 """Computes per-chain frequencies and base-pair type counts.
516 557
517 REQUIRES tables chain, nucleotide up to date. """ 558 REQUIRES tables chain, nucleotide up to date. """
518 559
519 - with sqlite3.connect("results/RNANet.db") as conn: 560 + with sqlite3.connect("results/RNANet.db", isolation_level=None) as conn:
520 # Compute per-chain nucleotide frequencies 561 # Compute per-chain nucleotide frequencies
521 df = pd.read_sql("SELECT SUM(is_A) as A, SUM(is_C) AS C, SUM(is_G) AS G, SUM(is_U) AS U, SUM(is_other) AS O, chain_id FROM nucleotide GROUP BY chain_id;", conn) 562 df = pd.read_sql("SELECT SUM(is_A) as A, SUM(is_C) AS C, SUM(is_G) AS G, SUM(is_U) AS U, SUM(is_other) AS O, chain_id FROM nucleotide GROUP BY chain_id;", conn)
522 df["total"] = pd.Series(df.A + df.C + df.G + df.U + df.O, dtype=np.float64) 563 df["total"] = pd.Series(df.A + df.C + df.G + df.U + df.O, dtype=np.float64)
...@@ -524,39 +565,74 @@ def per_chain_stats(): ...@@ -524,39 +565,74 @@ def per_chain_stats():
524 df = df.drop("total", axis=1) 565 df = df.drop("total", axis=1)
525 566
526 # Set the values 567 # Set the values
568 + conn.execute('pragma journal_mode=wal')
527 sql_execute(conn, "UPDATE chain SET chain_freq_A = ?, chain_freq_C = ?, chain_freq_G = ?, chain_freq_U = ?, chain_freq_other = ? WHERE chain_id= ?;", 569 sql_execute(conn, "UPDATE chain SET chain_freq_A = ?, chain_freq_C = ?, chain_freq_G = ?, chain_freq_U = ?, chain_freq_other = ? WHERE chain_id= ?;",
528 many=True, data=list(df.to_records(index=False)), warn_every=10) 570 many=True, data=list(df.to_records(index=False)), warn_every=10)
529 notify("Updated the database with per-chain base frequencies") 571 notify("Updated the database with per-chain base frequencies")
530 572
def log_to_pbar(pbar):
    """Build a callback that advances ``pbar`` by one step each time it is called.

    The returned function accepts one argument and ignores it, so it can be
    used as the ``callback`` of ``Pool.apply_async``, which passes the job's
    return value to it.
    """
    def update(r):
        pbar.update(1)
    return update
577 +
if __name__ == "__main__":
    # Local import: sqlite3's own "with" only commits or rolls back, it never
    # closes the connection. closing() guarantees the handle is released
    # before the worker pool is created.
    from contextlib import closing

    os.makedirs("results/figures/wadley_plots/", exist_ok=True)

    print("Loading mappings list...")
    with closing(sqlite3.connect("results/RNANet.db")) as conn:
        fam_list = [ x[0] for x in sql_ask_database(conn, "SELECT rfam_acc from family ORDER BY rfam_acc ASC;") ]
        mappings_list = {}
        for k in fam_list:
            mappings_list[k] = [ x[0] for x in sql_ask_database(conn, f"SELECT chain_id from chain WHERE rfam_acc='{k}' and issue=0;") ]

        # List the families for which we will compute sequence identity matrices
        famlist = [ x[0] for x in sql_ask_database(conn, "SELECT rfam_acc from (SELECT rfam_acc, COUNT(chain_id) as n_chains FROM family NATURAL JOIN chain GROUP BY rfam_acc) WHERE n_chains > 0 ORDER BY rfam_acc ASC;") ]
        ignored = [ x[0] for x in sql_ask_database(conn, "SELECT rfam_acc from (SELECT rfam_acc, COUNT(chain_id) as n_chains FROM family NATURAL JOIN chain GROUP BY rfam_acc) WHERE n_chains < 2 ORDER BY rfam_acc ASC;") ]
    if len(ignored):
        print(f"Idty matrices: Ignoring {len(ignored)} families with only one chain:", " ".join(ignored)+'\n')

    # Prepare the multiprocessing execution environment.
    # Use at most 32 workers, leave one core free, and never go below 1.
    nworkers = max(1, min(read_cpu_number() - 1, 32))
    thr_idx_mgr = Manager()
    idxQueue = thr_idx_mgr.Queue()
    for i in range(nworkers):
        idxQueue.put(i)  # one index per worker, taken and put back by the jobs

    # Define the tasks
    joblist = []
    joblist.append(Job(function=reproduce_wadley_results, args=(1,)))
    joblist.append(Job(function=reproduce_wadley_results, args=(4,)))
    joblist.append(Job(function=stats_len))   # Computes figures
    joblist.append(Job(function=stats_freq))  # updates the database
    for f in famlist:
        # presumably writes the per-family CSV files that stats_pairs() reads afterwards
        joblist.append(Job(function=parallel_stats_pairs, args=(f,)))
        if f not in ignored:
            # saves data/<family>.npy, loaded afterwards by seq_idty()
            joblist.append(Job(function=to_dist_matrix, args=(f,)))

    p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=nworkers)
    pbar = tqdm(total=len(joblist), desc="Stat jobs", position=0, leave=True)

    def log_job_error(exc):
        # Without an error_callback, an exception raised inside a worker is
        # dropped silently and the progress bar never reaches its total.
        warn(f"A statistics job failed: {exc!r}", error=True)
        pbar.update(1)

    try:
        for j in joblist:
            p.apply_async(j.func_, args=j.args_, callback=log_to_pbar(pbar), error_callback=log_job_error)
        p.close()
        p.join()
        pbar.close()
    except KeyboardInterrupt:
        warn("KeyboardInterrupt, terminating workers.", error=True)
        p.terminate()
        p.join()
        pbar.close()
        exit(1)
    except Exception as e:
        # Best effort: report what happened, stop the workers, and still try
        # the sequential steps below with whatever results are available.
        print(f"Something went wrong: {e!r}")
        p.terminate()
        p.join()
        pbar.close()

    print()
    print()

    # finish the work after the parallel portions
    per_chain_stats()
    seq_idty()
    stats_pairs()
......