Merge branch 'master' of https://github.com/persalteas/RNANet into master
Showing
7 changed files
with
992 additions
and
214 deletions
... | @@ -7,7 +7,7 @@ Future versions might compute a real MSA-based clustering directly with Rfamseq n | ... | @@ -7,7 +7,7 @@ Future versions might compute a real MSA-based clustering directly with Rfamseq n |
7 | This script prepares the dataset from available public data in PDB and Rfam. | 7 | This script prepares the dataset from available public data in PDB and Rfam. |
8 | 8 | ||
9 | 9 | ||
10 | -**Please cite**: *Coming soon, expect it summer 2020* | 10 | +**Please cite**: *Coming soon, expect it in 2021* |
11 | 11 | ||
12 | # What it does | 12 | # What it does |
13 | The script follows these steps: | 13 | The script follows these steps: |
... | @@ -72,7 +72,7 @@ You need to install: | ... | @@ -72,7 +72,7 @@ You need to install: |
72 | 72 | ||
73 | ## Command line | 73 | ## Command line |
74 | Run `./RNANet.py --3d-folder path/to/3D/data/folder --seq-folder path/to/sequence/data/folder [ - other options ]`. | 74 | Run `./RNANet.py --3d-folder path/to/3D/data/folder --seq-folder path/to/sequence/data/folder [ - other options ]`. |
75 | -It requires solid hardware to run. It takes around 15 hours the first time, and 9h then, tested on a server with 32 cores and 48GB of RAM. | 75 | +It requires solid hardware to run. It takes around 12 to 15 hours the first time, and 1 to 3h then, tested on a server with 32 cores and 48GB of RAM. |
76 | The detailed list of options is below: | 76 | The detailed list of options is below: |
77 | 77 | ||
78 | ``` | 78 | ``` | ... | ... |
... | @@ -273,32 +273,39 @@ class Chain: | ... | @@ -273,32 +273,39 @@ class Chain: |
273 | if self.mapping is not None: | 273 | if self.mapping is not None: |
274 | self.mapping.log(f"Shifting nt_resnum numbering because of {n_dup} duplicate residues {df.iloc[i,1]}") | 274 | self.mapping.log(f"Shifting nt_resnum numbering because of {n_dup} duplicate residues {df.iloc[i,1]}") |
275 | 275 | ||
276 | - if df.iloc[i,1] == df.iloc[i-1,1] and df.iloc[index_last_dup + 1, 1] - 1 > df.iloc[index_last_dup, 1]: | 276 | + try: |
277 | - # The redundant nts are consecutive in the chain (at the begining at least), and there is a gap at the end | 277 | + if i > 0 and index_last_dup +1 < len(df.index) and df.iloc[i,1] == df.iloc[i-1,1] and df.iloc[index_last_dup + 1, 1] - 1 > df.iloc[index_last_dup, 1]: |
278 | - | 278 | + # The redundant nts are consecutive in the chain (at the begining at least), and there is a gap at the end |
279 | - if duplicates.iloc[n_dup-1, 0] - duplicates.iloc[0, 0] + 1 == n_dup: | 279 | + |
280 | - # They are all contiguous in the chain | 280 | + if duplicates.iloc[n_dup-1, 0] - duplicates.iloc[0, 0] + 1 == n_dup: |
281 | - # 4v9n-DA case (and similar ones) : 610-611-611A-611B-611C-611D-611E-611F-611G-617-618... | 281 | + # They are all contiguous in the chain |
282 | - # there is a redundancy (611) followed by a gap (611-617). | 282 | + # 4v9n-DA case (and similar ones) : 610-611-611A-611B-611C-611D-611E-611F-611G-617-618... |
283 | - # We want the redundancy to fill the gap. | 283 | + # there is a redundancy (611) followed by a gap (611-617). |
284 | - df.iloc[i:i+n_dup-1, 1] += 1 | 284 | + # We want the redundancy to fill the gap. |
285 | + df.iloc[i:i+n_dup-1, 1] += 1 | ||
286 | + else: | ||
287 | + # We solve the problem continous component by continuous component | ||
288 | + for j in range(1, n_dup+1): | ||
289 | + if duplicates.iloc[j,0] == 1 + duplicates.iloc[j-1,0]: # continuous | ||
290 | + df.iloc[i+j-1,1] += 1 | ||
291 | + else: | ||
292 | + break | ||
293 | + elif df.iloc[i,1] == df.iloc[i-1,1]: | ||
294 | + # Common 4v9q-DV case (and similar ones) : e.g. chains contains 17 and 17A which are both read 17 by DSSR. | ||
295 | + # Solution : we shift the numbering of 17A (to 18) and the following residues. | ||
296 | + df.iloc[i:, 1] += 1 | ||
285 | else: | 297 | else: |
286 | - # We solve the problem continous component by continuous component | 298 | + # 4v9k-DA case (and similar ones) : the nt_id is not the full nt_resnum: ... 1629 > 1630 > 163B > 1631 > ... |
287 | - for j in range(1, n_dup+1): | 299 | + # Here the 163B is read 163 by DSSR, but there already is a residue 163. |
288 | - if duplicates.iloc[j,0] == 1 + duplicates.iloc[j-1,0]: # continuous | 300 | + # Solution : set nt_resnum[i] to nt_resnum[i-1] + 1, and shift the following by 1. |
289 | - df.iloc[i+j-1,1] += 1 | 301 | + df.iloc[i, 1] = 1 + df.iloc[i-1, 1] |
290 | - else: | 302 | + df.iloc[i+1:, 1] += 1 |
291 | - break | 303 | + except: |
292 | - elif df.iloc[i,1] == df.iloc[i-1,1]: | 304 | + warn(f"Error with parsing of {self.chain_label} duplicate residue numbers. Ignoring it.") |
293 | - # Common 4v9q-DV case (and similar ones) : e.g. chains contains 17 and 17A which are both read 17 by DSSR. | 305 | + self.delete_me = True |
294 | - # Solution : we shift the numbering of 17A (to 18) and the following residues. | 306 | + self.error_messages = f"Error with parsing of duplicate residues numbers." |
295 | - df.iloc[i:, 1] += 1 | 307 | + return None |
296 | - else: | 308 | + |
297 | - # 4v9k-DA case (and similar ones) : the nt_id is not the full nt_resnum: ... 1629 > 1630 > 163B > 1631 > ... | ||
298 | - # Here the 163B is read 163 by DSSR, but there already is a residue 163. | ||
299 | - # Solution : set nt_resnum[i] to nt_resnum[i-1] + 1, and shift the following by 1. | ||
300 | - df.iloc[i, 1] = 1 + df.iloc[i-1, 1] | ||
301 | - df.iloc[i+1:, 1] += 1 | ||
302 | 309 | ||
303 | # Search for ligands at the end of the selection | 310 | # Search for ligands at the end of the selection |
304 | # Drop ligands detected as residues by DSSR, by detecting several markers | 311 | # Drop ligands detected as residues by DSSR, by detecting several markers |
... | @@ -1019,7 +1026,7 @@ class Pipeline: | ... | @@ -1019,7 +1026,7 @@ class Pipeline: |
1019 | print(f"nohup bash -c 'time {runDir}/RNAnet.py --3d-folder ~/Data/RNA/3D/ --seq-folder ~/Data/RNA/sequences -s --archive' &") | 1026 | print(f"nohup bash -c 'time {runDir}/RNAnet.py --3d-folder ~/Data/RNA/3D/ --seq-folder ~/Data/RNA/sequences -s --archive' &") |
1020 | sys.exit() | 1027 | sys.exit() |
1021 | elif opt == '--version': | 1028 | elif opt == '--version': |
1022 | - print("RNANet 1.0 alpha ") | 1029 | + print("RNANet 1.1 beta") |
1023 | sys.exit() | 1030 | sys.exit() |
1024 | elif opt == "-r" or opt == "--resolution": | 1031 | elif opt == "-r" or opt == "--resolution": |
1025 | assert float(arg) > 0.0 and float(arg) <= 20.0 | 1032 | assert float(arg) > 0.0 and float(arg) <= 20.0 |
... | @@ -1382,7 +1389,7 @@ class Pipeline: | ... | @@ -1382,7 +1389,7 @@ class Pipeline: |
1382 | # Remove previous precomputed data | 1389 | # Remove previous precomputed data |
1383 | subprocess.run(["rm","-f", "data/wadley_kernel_eta.npz", "data/wadley_kernel_eta_prime.npz", "data/pair_counts.csv"]) | 1390 | subprocess.run(["rm","-f", "data/wadley_kernel_eta.npz", "data/wadley_kernel_eta_prime.npz", "data/pair_counts.csv"]) |
1384 | for f in self.fam_list: | 1391 | for f in self.fam_list: |
1385 | - subprocess.run(["rm","-f", f"data/{f}.npy"]) | 1392 | + subprocess.run(["rm","-f", f"data/{f}.npy", f"data/{f}_pairs.csv", f"data/{f}_counts.csv"]) |
1386 | 1393 | ||
1387 | # Run statistics files | 1394 | # Run statistics files |
1388 | os.chdir(runDir) | 1395 | os.chdir(runDir) |
... | @@ -1390,13 +1397,12 @@ class Pipeline: | ... | @@ -1390,13 +1397,12 @@ class Pipeline: |
1390 | subprocess.run(["python3.8", "statistics.py", path_to_3D_data, path_to_seq_data]) | 1397 | subprocess.run(["python3.8", "statistics.py", path_to_3D_data, path_to_seq_data]) |
1391 | 1398 | ||
1392 | # Save additional informations | 1399 | # Save additional informations |
1393 | - conn = sqlite3.connect(runDir+"/results/RNANet.db") | 1400 | + with sqlite3.connect(runDir+"/results/RNANet.db") as conn: |
1394 | - pd.read_sql_query("SELECT rfam_acc, description, idty_percent, nb_homologs, nb_3d_chains, nb_total_homol, max_len, comput_time, comput_peak_mem from family ORDER BY nb_3d_chains DESC;", | 1401 | + pd.read_sql_query("SELECT rfam_acc, description, idty_percent, nb_homologs, nb_3d_chains, nb_total_homol, max_len, comput_time, comput_peak_mem from family ORDER BY nb_3d_chains DESC;", |
1395 | - conn).to_csv(runDir + f"/results/archive/families_{time_str}.csv", float_format="%.2f", index=False) | 1402 | + conn).to_csv(runDir + f"/results/archive/families_{time_str}.csv", float_format="%.2f", index=False) |
1396 | - pd.read_sql_query("""SELECT structure_id, chain_name, pdb_start, pdb_end, rfam_acc, inferred, date, exp_method, resolution, issue FROM structure | 1403 | + pd.read_sql_query("""SELECT structure_id, chain_name, pdb_start, pdb_end, rfam_acc, inferred, date, exp_method, resolution, issue FROM structure |
1397 | - JOIN chain ON structure.pdb_id = chain.structure_id | 1404 | + JOIN chain ON structure.pdb_id = chain.structure_id |
1398 | - ORDER BY structure_id, chain_name, rfam_acc ASC;""", conn).to_csv(runDir + f"/results/archive/summary_{time_str}.csv", float_format="%.2f", index=False) | 1405 | + ORDER BY structure_id, chain_name, rfam_acc ASC;""", conn).to_csv(runDir + f"/results/archive/summary_{time_str}.csv", float_format="%.2f", index=False) |
1399 | - conn.close() | ||
1400 | 1406 | ||
1401 | # Archive the results | 1407 | # Archive the results |
1402 | if self.SELECT_ONLY is None: | 1408 | if self.SELECT_ONLY is None: |
... | @@ -1404,7 +1410,10 @@ class Pipeline: | ... | @@ -1404,7 +1410,10 @@ class Pipeline: |
1404 | subprocess.run(["tar","-C", path_to_3D_data + "/datapoints","-czf",f"results/archive/RNANET_datapoints_{time_str}.tar.gz","."]) | 1410 | subprocess.run(["tar","-C", path_to_3D_data + "/datapoints","-czf",f"results/archive/RNANET_datapoints_{time_str}.tar.gz","."]) |
1405 | 1411 | ||
1406 | # Update shortcuts to latest versions | 1412 | # Update shortcuts to latest versions |
1407 | - subprocess.run(["rm", "-f", runDir + "/results/RNANET_datapoints_latest.tar.gz", runDir + "/results/summary_latest.csv", runDir + "/results/families_latest.csv"]) | 1413 | + subprocess.run(["rm", "-f", runDir + "/results/RNANET_datapoints_latest.tar.gz", |
1414 | + runDir + "/results/summary_latest.csv", | ||
1415 | + runDir + "/results/families_latest.csv" | ||
1416 | + ]) | ||
1408 | subprocess.run(['ln',"-s", runDir +f"/results/archive/RNANET_datapoints_{time_str}.tar.gz", runDir + "/results/RNANET_datapoints_latest.tar.gz"]) | 1417 | subprocess.run(['ln',"-s", runDir +f"/results/archive/RNANET_datapoints_{time_str}.tar.gz", runDir + "/results/RNANET_datapoints_latest.tar.gz"]) |
1409 | subprocess.run(['ln',"-s", runDir +f"/results/archive/summary_{time_str}.csv", runDir + "/results/summary_latest.csv"]) | 1418 | subprocess.run(['ln',"-s", runDir +f"/results/archive/summary_{time_str}.csv", runDir + "/results/summary_latest.csv"]) |
1410 | subprocess.run(['ln',"-s", runDir +f"/results/archive/families_{time_str}.csv", runDir + "/results/families_latest.csv"]) | 1419 | subprocess.run(['ln',"-s", runDir +f"/results/archive/families_{time_str}.csv", runDir + "/results/families_latest.csv"]) |
... | @@ -1631,6 +1640,7 @@ def sql_ask_database(conn, sql, warn_every = 10): | ... | @@ -1631,6 +1640,7 @@ def sql_ask_database(conn, sql, warn_every = 10): |
1631 | 1640 | ||
1632 | @trace_unhandled_exceptions | 1641 | @trace_unhandled_exceptions |
1633 | def sql_execute(conn, sql, many=False, data=None, warn_every=10): | 1642 | def sql_execute(conn, sql, many=False, data=None, warn_every=10): |
1643 | + conn.execute('pragma journal_mode=wal') # Allow multiple other readers to ask things while we execute this writing query | ||
1634 | for _ in range(100): # retry 100 times if it fails | 1644 | for _ in range(100): # retry 100 times if it fails |
1635 | try: | 1645 | try: |
1636 | if many: | 1646 | if many: |
... | @@ -2397,6 +2407,7 @@ if __name__ == "__main__": | ... | @@ -2397,6 +2407,7 @@ if __name__ == "__main__": |
2397 | rfam_acc_to_download[c.mapping.rfam_acc] = [ c ] | 2407 | rfam_acc_to_download[c.mapping.rfam_acc] = [ c ] |
2398 | else: | 2408 | else: |
2399 | rfam_acc_to_download[c.mapping.rfam_acc].append(c) | 2409 | rfam_acc_to_download[c.mapping.rfam_acc].append(c) |
2410 | + | ||
2400 | print(f"> Identified {len(rfam_acc_to_download.keys())} families to update and re-align with the crystals' sequences") | 2411 | print(f"> Identified {len(rfam_acc_to_download.keys())} families to update and re-align with the crystals' sequences") |
2401 | pp.fam_list = sorted(rfam_acc_to_download.keys()) | 2412 | pp.fam_list = sorted(rfam_acc_to_download.keys()) |
2402 | 2413 | ... | ... |
automate.sh
100644 → 100755
1 | # This is a script supposed to be run periodically as a cron job | 1 | # This is a script supposed to be run periodically as a cron job |
2 | 2 | ||
3 | +cd /home/lbecquey/Projects/RNANet | ||
4 | +rm -f latest_run.log errors.txt | ||
5 | + | ||
3 | # Run RNANet | 6 | # Run RNANet |
4 | -cd /home/lbecquey/Projects/RNANet; | 7 | +bash -c 'time ./RNAnet.py --3d-folder /home/lbecquey/Data/RNA/3D/ --seq-folder /home/lbecquey/Data/RNA/sequences/ -r 20.0 -s --archive' &> latest_run.log |
5 | -rm -f stdout.txt stderr.txt errors.txt; | 8 | +touch results/RNANet.db # update last modification date |
6 | -time './RNAnet.py --3d-folder /home/lbequey/Data/RNA/3D/ --seq-folder /home/lbecquey/Data/RNA/sequences/ -s -r 20.0' > stdout.txt 2> stderr.txt; | 9 | +rm -f results/RNANet.db-wal results/RNANet.db-shm # SQLite temporary files |
7 | 10 | ||
8 | -# Sync in Seafile | 11 | +# Compress |
9 | -seaf-cli start; | 12 | +rm -f /home/lbecquey/Projects/RNANet/results/RNANet.db.gz |
13 | +echo 'Deleted results/RNANet.db.gz (if existed)' >> latest_run.log | ||
14 | +gzip -k /home/lbecquey/Projects/RNANet/results/RNANet.db | ||
15 | +echo 'Recreated it.' >> latest_run.log | ||
10 | 16 | ||
11 | -seaf-cli stop; | 17 | +# Sync in Seafile |
18 | +seaf-cli start >> latest_run.log 2>&1 | ||
19 | +echo 'Waiting 10m for SeaFile synchronization...' >> latest_run.log | ||
20 | +sleep 10m | ||
21 | +echo `seaf-cli status` >> latest_run.log | ||
22 | +seaf-cli stop >> latest_run.log 2>&1 | ||
23 | +echo 'We are '`date`', update completed.' >> latest_run.log | ||
12 | 24 | ... | ... |
... | @@ -10,6 +10,17 @@ for KILLPID in $PROCESS_LIST; do | ... | @@ -10,6 +10,17 @@ for KILLPID in $PROCESS_LIST; do |
10 | fi | 10 | fi |
11 | done | 11 | done |
12 | 12 | ||
13 | +PROCESS_TO_KILL="statistics.py" | ||
14 | +PROCESS_LIST=`ps ax | grep -Ei ${PROCESS_TO_KILL} | grep -Eiv '(grep|vi statistics.py)' | awk ' { print $1;}'` | ||
15 | +KILLED= | ||
16 | +for KILLPID in $PROCESS_LIST; do | ||
17 | + if [ ! -z $KILLPID ];then | ||
18 | + kill -9 $KILLPID | ||
19 | + echo "Killed PID ${KILLPID}" | ||
20 | + KILLED=yes | ||
21 | + fi | ||
22 | +done | ||
23 | + | ||
13 | if [ -z $KILLED ];then | 24 | if [ -z $KILLED ];then |
14 | echo "Didn't kill anything" | 25 | echo "Didn't kill anything" |
15 | fi | 26 | fi | ... | ... |
known_issues.txt
0 → 100644
1 | +1ml5_1_a_1-2914 | ||
2 | +1ml5_1_a_151-2903 | ||
3 | +1ml5_1_A_7-1518 | ||
4 | +1ml5_1_A_7-1515 | ||
5 | +1ml5_1_A_2-1520 | ||
6 | +1ml5_1_b_5-121 | ||
7 | +2rdo_1_A_3-118 | ||
8 | +4v48_1_A9_3-118 | ||
9 | +4v47_1_A9_3-118 | ||
10 | +1vy7_1_AY_1-73 | ||
11 | +1vy7_1_CY_1-73 | ||
12 | +4w2h_1_CY_1-73 | ||
13 | +6zmi_1_L8_1267-4755 | ||
14 | +6zm7_1_L8_1267-4755 | ||
15 | +6y6x_1_L8_1267-4755 | ||
16 | +6z6n_1_L8_1267-4755 | ||
17 | +6qzp_1_L8_1267-4755 | ||
18 | +6zme_1_L8_1267-4755 | ||
19 | +6z6l_1_L8_1267-4755 | ||
20 | +6ek0_1_L8_1267-4755 | ||
21 | +6zmo_1_L8_1267-4755 | ||
22 | +6z6m_1_L8_1267-4755 | ||
23 | +6ole_1_D_1267-4755 | ||
24 | +6om0_1_D_1267-4755 | ||
25 | +6y2l_1_L8_1267-4755 | ||
26 | +6y0g_1_L8_1267-4755 | ||
27 | +6oli_1_D_1267-4755 | ||
28 | +6olg_1_A3_1267-4755 | ||
29 | +6y57_1_L8_1267-4755 | ||
30 | +5t2c_1_C_1267-4755 | ||
31 | +6om7_1_D_1267-4755 | ||
32 | +4ug0_1_L8_1267-4755 | ||
33 | +6olf_1_D_1267-4755 | ||
34 | +6ip5_1_1C_1267-4755 | ||
35 | +6ip8_1_1C_1267-4755 | ||
36 | +6olz_1_A3_1267-4755 | ||
37 | +5aj0_1_A3_1267-4755 | ||
38 | +5lks_1_L8_1267-4755 | ||
39 | +6ip6_1_1C_1267-4755 | ||
40 | +4v6x_1_A8_1267-4755 | ||
41 | +2z9q_1_A_1-72 | ||
42 | +1ls2_1_B_1-73 | ||
43 | +3ep2_1_Y_1-72 | ||
44 | +3eq3_1_Y_1-72 | ||
45 | +4v48_1_A6_1-73 | ||
46 | +1gsg_1_T_1-72 | ||
47 | +3jcr_1_H_1-115 | ||
48 | +1eg0_1_O_1-73 | ||
49 | +4v42_1_BB_5-121 | ||
50 | +4v42_1_BA_1-2914 | ||
51 | +4v42_1_BA_151-2903 | ||
52 | +2ob7_1_A_10-319 | ||
53 | +1x1l_1_A_1-130 | ||
54 | +1zc8_1_Z_1-130 | ||
55 | +1zc8_1_Z_1-91 | ||
56 | +2ob7_1_D_1-130 | ||
57 | +1r2x_1_C_1-58 | ||
58 | +1r2w_1_C_1-58 | ||
59 | +1eg0_1_L_1-56 | ||
60 | +1eg0_1_L_1-57 | ||
61 | +6rxu_1_C2_588-2386 | ||
62 | +6rxu_1_C2_588-2383 | ||
63 | +6rxu_1_C2_583-2388 | ||
64 | +5oql_1_2_588-2386 | ||
65 | +5oql_1_2_588-2383 | ||
66 | +5oql_1_2_583-2388 | ||
67 | +6rxv_1_C2_588-2386 | ||
68 | +6rxv_1_C2_588-2383 | ||
69 | +6rxv_1_C2_583-2388 | ||
70 | +6rxz_1_C2_588-2386 | ||
71 | +6rxz_1_C2_588-2383 | ||
72 | +6rxz_1_C2_583-2388 | ||
73 | +6rxy_1_C2_588-2386 | ||
74 | +6rxy_1_C2_588-2383 | ||
75 | +6rxy_1_C2_583-2388 | ||
76 | +6rxt_1_C2_588-2386 | ||
77 | +6rxt_1_C2_588-2383 | ||
78 | +6rxt_1_C2_583-2388 | ||
79 | +4v48_1_BA_1-91 | ||
80 | +4v48_1_BA_6-1541 | ||
81 | +4v48_1_BA_6-1538 | ||
82 | +4v48_1_BA_1-1543 | ||
83 | +4v47_1_BA_1-91 | ||
84 | +4v47_1_BA_6-1540 | ||
85 | +4v47_1_BA_6-1537 | ||
86 | +4v47_1_BA_1-1542 | ||
87 | +2rdo_1_B_6-1460 | ||
88 | +2rdo_1_B_6-1522 | ||
89 | +2rdo_1_B_1-2903 | ||
90 | +2rdo_1_B_6-1457 | ||
91 | +2rdo_1_B_1-2904 | ||
92 | +2rdo_1_B_1-1528 | ||
93 | +2rdo_1_B_160-2893 | ||
94 | +4v48_1_A0_6-1460 | ||
95 | +4v48_1_A0_6-1522 | ||
96 | +4v48_1_A0_1-2903 | ||
97 | +4v48_1_A0_6-1457 | ||
98 | +4v48_1_A0_1-2904 | ||
99 | +4v48_1_A0_1-1528 | ||
100 | +4v48_1_A0_160-2893 | ||
101 | +4v47_1_A0_6-1460 | ||
102 | +4v47_1_A0_6-1522 | ||
103 | +4v47_1_A0_1-2903 | ||
104 | +4v47_1_A0_6-1457 | ||
105 | +4v47_1_A0_1-2904 | ||
106 | +4v47_1_A0_1-1528 | ||
107 | +4v47_1_A0_160-2893 | ||
108 | +1zc8_1_A_1-59 | ||
109 | +1mvr_1_D_1-59 | ||
110 | +4c9d_1_D_29-1 | ||
111 | +4c9d_1_C_29-1 | ||
112 | +4adx_1_9_1-121 | ||
113 | +1zn1_1_B_1-59 | ||
114 | +1emi_1_B_1-108 | ||
115 | +3iy9_1_A_498-1027 | ||
116 | +1jgq_1_A_20-55 | ||
117 | +1jgq_1_A_7-1518 | ||
118 | +1jgq_1_A_7-1515 | ||
119 | +1jgq_1_A_2-1520 | ||
120 | +4v42_1_AA_20-55 | ||
121 | +4v42_1_AA_7-1518 | ||
122 | +4v42_1_AA_7-1515 | ||
123 | +4v42_1_AA_2-1520 | ||
124 | +1jgo_1_A_20-55 | ||
125 | +1jgo_1_A_7-1518 | ||
126 | +1jgo_1_A_7-1515 | ||
127 | +1jgo_1_A_2-1520 | ||
128 | +1jgp_1_A_20-55 | ||
129 | +1jgp_1_A_7-1518 | ||
130 | +1jgp_1_A_7-1515 | ||
131 | +1jgp_1_A_2-1520 | ||
132 | +3ep2_1_B_1-50 | ||
133 | +3eq3_1_B_1-50 | ||
134 | +3eq4_1_B_1-50 | ||
135 | +3pgw_1_R_1-164 | ||
136 | +3pgw_1_N_1-164 | ||
137 | +3cw1_1_x_1-138 | ||
138 | +3cw1_1_w_1-138 | ||
139 | +3cw1_1_V_1-138 | ||
140 | +3cw1_1_v_1-138 | ||
141 | +2iy3_1_B_9-105 | ||
142 | +3jcr_1_N_1-106 | ||
143 | +3jcr_1_N_1-188 | ||
144 | +2vaz_1_A_64-177 | ||
145 | +2ftc_1_R_81-1466 | ||
146 | +2ftc_1_R_1-1568 | ||
147 | +2ftc_1_R_792-1568 | ||
148 | +3jcr_1_M_1-141 | ||
149 | +3jcr_1_M_1-107 | ||
150 | +3jcr_1_M_1-188 | ||
151 | +4v5z_1_B0_1-2840 | ||
152 | +4v5z_1_B0_1-2899 | ||
153 | +4v5z_1_B0_1-2902 | ||
154 | +5g2x_1_A_595-692 | ||
155 | +3iy8_1_A_1-540 | ||
156 | +4v5z_1_BY_2-113 | ||
157 | +4v5z_1_BZ_1-70 | ||
158 | +1mvr_1_B_1-96 | ||
159 | +4adx_1_0_1-2923 | ||
160 | +4adx_1_0_132-2915 | ||
161 | +3eq4_1_Y_1-69 | ||
162 | +4v5z_1_AA_1-1562 | ||
163 | +4v5z_1_AA_1-1563 | ||
164 | +6lqm_1_8_1267-4755 | ||
165 | +6lu8_1_8_1267-4755 | ||
166 | +6lsr_1_8_1267-4755 | ||
167 | +6lss_1_8_1267-4755 |
known_issues_reasons.txt
0 → 100644
1 | +1ml5_1_a_1-2914 | ||
2 | +Could not find nucleotides of chain a in annotation 1ml5.json. Either there is a problem with 1ml5 mmCIF download, or the bases are not resolved in the structure. Delete it and retry. | ||
3 | + | ||
4 | +1ml5_1_a_151-2903 | ||
5 | +Could not find nucleotides of chain a in annotation 1ml5.json. Either there is a problem with 1ml5 mmCIF download, or the bases are not resolved in the structure. Delete it and retry. | ||
6 | + | ||
7 | +1ml5_1_A_7-1518 | ||
8 | +Could not find nucleotides of chain A in annotation 1ml5.json. Either there is a problem with 1ml5 mmCIF download, or the bases are not resolved in the structure. Delete it and retry. | ||
9 | + | ||
10 | +1ml5_1_A_7-1515 | ||
11 | +Could not find nucleotides of chain A in annotation 1ml5.json. Either there is a problem with 1ml5 mmCIF download, or the bases are not resolved in the structure. Delete it and retry. | ||
12 | + | ||
13 | +1ml5_1_A_2-1520 | ||
14 | +Could not find nucleotides of chain A in annotation 1ml5.json. Either there is a problem with 1ml5 mmCIF download, or the bases are not resolved in the structure. Delete it and retry. | ||
15 | + | ||
16 | +1ml5_1_b_5-121 | ||
17 | +Could not find nucleotides of chain b in annotation 1ml5.json. Either there is a problem with 1ml5 mmCIF download, or the bases are not resolved in the structure. Delete it and retry. | ||
18 | + | ||
19 | +2rdo_1_A_3-118 | ||
20 | +DSSR warning 2rdo.json: no nucleotides found. Ignoring 2rdo_1_A_3-118. | ||
21 | + | ||
22 | +4v48_1_A9_3-118 | ||
23 | +DSSR warning 4v48.json: no nucleotides found. Ignoring 4v48_1_A9_3-118. | ||
24 | + | ||
25 | +4v47_1_A9_3-118 | ||
26 | +DSSR warning 4v47.json: no nucleotides found. Ignoring 4v47_1_A9_3-118. | ||
27 | + | ||
28 | +1vy7_1_AY_1-73 | ||
29 | +Sequence is too short. (< 5 resolved nts) | ||
30 | + | ||
31 | +1vy7_1_CY_1-73 | ||
32 | +Sequence is too short. (< 5 resolved nts) | ||
33 | + | ||
34 | +4w2h_1_CY_1-73 | ||
35 | +Sequence is too short. (< 5 resolved nts) | ||
36 | + | ||
37 | +6zmi_1_L8_1267-4755 | ||
38 | +Could not find nucleotides of chain L8 in annotation 6zmi.json. Either there is a problem with 6zmi mmCIF download, or the bases are not resolved in the structure. Delete it and retry. | ||
39 | + | ||
40 | +6zm7_1_L8_1267-4755 | ||
41 | +Could not find nucleotides of chain L8 in annotation 6zm7.json. Either there is a problem with 6zm7 mmCIF download, or the bases are not resolved in the structure. Delete it and retry. | ||
42 | + | ||
43 | +6y6x_1_L8_1267-4755 | ||
44 | +Could not find nucleotides of chain L8 in annotation 6y6x.json. Either there is a problem with 6y6x mmCIF download, or the bases are not resolved in the structure. Delete it and retry. | ||
45 | + | ||
46 | +6z6n_1_L8_1267-4755 | ||
47 | +Could not find nucleotides of chain L8 in annotation 6z6n.json. Either there is a problem with 6z6n mmCIF download, or the bases are not resolved in the structure. Delete it and retry. | ||
48 | + | ||
49 | +6qzp_1_L8_1267-4755 | ||
50 | +Could not find nucleotides of chain L8 in annotation 6qzp.json. Either there is a problem with 6qzp mmCIF download, or the bases are not resolved in the structure. Delete it and retry. | ||
51 | + | ||
52 | +6zme_1_L8_1267-4755 | ||
53 | +Could not find nucleotides of chain L8 in annotation 6zme.json. Either there is a problem with 6zme mmCIF download, or the bases are not resolved in the structure. Delete it and retry. | ||
54 | + | ||
55 | +6z6l_1_L8_1267-4755 | ||
56 | +Could not find nucleotides of chain L8 in annotation 6z6l.json. Either there is a problem with 6z6l mmCIF download, or the bases are not resolved in the structure. Delete it and retry. | ||
57 | + | ||
58 | +6ek0_1_L8_1267-4755 | ||
59 | +Could not find nucleotides of chain L8 in annotation 6ek0.json. Either there is a problem with 6ek0 mmCIF download, or the bases are not resolved in the structure. Delete it and retry. | ||
60 | + | ||
61 | +6zmo_1_L8_1267-4755 | ||
62 | +Could not find nucleotides of chain L8 in annotation 6zmo.json. Either there is a problem with 6zmo mmCIF download, or the bases are not resolved in the structure. Delete it and retry. | ||
63 | + | ||
64 | +6z6m_1_L8_1267-4755 | ||
65 | +Could not find nucleotides of chain L8 in annotation 6z6m.json. Either there is a problem with 6z6m mmCIF download, or the bases are not resolved in the structure. Delete it and retry. | ||
66 | + | ||
67 | +6ole_1_D_1267-4755 | ||
68 | +Could not find nucleotides of chain D in annotation 6ole.json. Either there is a problem with 6ole mmCIF download, or the bases are not resolved in the structure. Delete it and retry. | ||
69 | + | ||
70 | +6om0_1_D_1267-4755 | ||
71 | +Could not find nucleotides of chain D in annotation 6om0.json. Either there is a problem with 6om0 mmCIF download, or the bases are not resolved in the structure. Delete it and retry. | ||
72 | + | ||
73 | +6y2l_1_L8_1267-4755 | ||
74 | +Could not find nucleotides of chain L8 in annotation 6y2l.json. Either there is a problem with 6y2l mmCIF download, or the bases are not resolved in the structure. Delete it and retry. | ||
75 | + | ||
76 | +6y0g_1_L8_1267-4755 | ||
77 | +Could not find nucleotides of chain L8 in annotation 6y0g.json. Either there is a problem with 6y0g mmCIF download, or the bases are not resolved in the structure. Delete it and retry. | ||
78 | + | ||
79 | +6oli_1_D_1267-4755 | ||
80 | +Could not find nucleotides of chain D in annotation 6oli.json. Either there is a problem with 6oli mmCIF download, or the bases are not resolved in the structure. Delete it and retry. | ||
81 | + | ||
82 | +6olg_1_A3_1267-4755 | ||
83 | +Could not find nucleotides of chain A3 in annotation 6olg.json. Either there is a problem with 6olg mmCIF download, or the bases are not resolved in the structure. Delete it and retry. | ||
84 | + | ||
85 | +6y57_1_L8_1267-4755 | ||
86 | +Could not find nucleotides of chain L8 in annotation 6y57.json. Either there is a problem with 6y57 mmCIF download, or the bases are not resolved in the structure. Delete it and retry. | ||
87 | + | ||
88 | +5t2c_1_C_1267-4755 | ||
89 | +Could not find nucleotides of chain C in annotation 5t2c.json. Either there is a problem with 5t2c mmCIF download, or the bases are not resolved in the structure. Delete it and retry. | ||
90 | + | ||
91 | +6om7_1_D_1267-4755 | ||
92 | +Could not find nucleotides of chain D in annotation 6om7.json. Either there is a problem with 6om7 mmCIF download, or the bases are not resolved in the structure. Delete it and retry. | ||
93 | + | ||
94 | +4ug0_1_L8_1267-4755 | ||
95 | +Could not find nucleotides of chain L8 in annotation 4ug0.json. Either there is a problem with 4ug0 mmCIF download, or the bases are not resolved in the structure. Delete it and retry. | ||
96 | + | ||
97 | +6olf_1_D_1267-4755 | ||
98 | +Could not find nucleotides of chain D in annotation 6olf.json. Either there is a problem with 6olf mmCIF download, or the bases are not resolved in the structure. Delete it and retry. | ||
99 | + | ||
100 | +6ip5_1_1C_1267-4755 | ||
101 | +Could not find nucleotides of chain 1C in annotation 6ip5.json. Either there is a problem with 6ip5 mmCIF download, or the bases are not resolved in the structure. Delete it and retry. | ||
102 | + | ||
103 | +6ip8_1_1C_1267-4755 | ||
104 | +Could not find nucleotides of chain 1C in annotation 6ip8.json. Either there is a problem with 6ip8 mmCIF download, or the bases are not resolved in the structure. Delete it and retry. | ||
105 | + | ||
106 | +6olz_1_A3_1267-4755 | ||
107 | +Could not find nucleotides of chain A3 in annotation 6olz.json. Either there is a problem with 6olz mmCIF download, or the bases are not resolved in the structure. Delete it and retry. | ||
108 | + | ||
109 | +5aj0_1_A3_1267-4755 | ||
110 | +Could not find nucleotides of chain A3 in annotation 5aj0.json. Either there is a problem with 5aj0 mmCIF download, or the bases are not resolved in the structure. Delete it and retry. | ||
111 | + | ||
112 | +5lks_1_L8_1267-4755 | ||
113 | +Could not find nucleotides of chain L8 in annotation 5lks.json. Either there is a problem with 5lks mmCIF download, or the bases are not resolved in the structure. Delete it and retry. | ||
114 | + | ||
115 | +6ip6_1_1C_1267-4755 | ||
116 | +Could not find nucleotides of chain 1C in annotation 6ip6.json. Either there is a problem with 6ip6 mmCIF download, or the bases are not resolved in the structure. Delete it and retry. | ||
117 | + | ||
118 | +4v6x_1_A8_1267-4755 | ||
119 | +Could not find nucleotides of chain A8 in annotation 4v6x.json. Either there is a problem with 4v6x mmCIF download, or the bases are not resolved in the structure. Delete it and retry. | ||
120 | + | ||
121 | +2z9q_1_A_1-72 | ||
122 | +DSSR warning 2z9q.json: no nucleotides found. Ignoring 2z9q_1_A_1-72. | ||
123 | + | ||
124 | +1ls2_1_B_1-73 | ||
125 | +DSSR warning 1ls2.json: no nucleotides found. Ignoring 1ls2_1_B_1-73. | ||
126 | + | ||
127 | +3ep2_1_Y_1-72 | ||
128 | +DSSR warning 3ep2.json: no nucleotides found. Ignoring 3ep2_1_Y_1-72. | ||
129 | + | ||
130 | +3eq3_1_Y_1-72 | ||
131 | +DSSR warning 3eq3.json: no nucleotides found. Ignoring 3eq3_1_Y_1-72. | ||
132 | + | ||
133 | +4v48_1_A6_1-73 | ||
134 | +DSSR warning 4v48.json: no nucleotides found. Ignoring 4v48_1_A6_1-73. | ||
135 | + | ||
136 | +1gsg_1_T_1-72 | ||
137 | +DSSR warning 1gsg.json: no nucleotides found. Ignoring 1gsg_1_T_1-72. | ||
138 | + | ||
139 | +3jcr_1_H_1-115 | ||
140 | +DSSR warning 3jcr.json: no nucleotides found. Ignoring 3jcr_1_H_1-115. | ||
141 | + | ||
142 | +1eg0_1_O_1-73 | ||
143 | +DSSR warning 1eg0.json: no nucleotides found. Ignoring 1eg0_1_O_1-73. | ||
144 | + | ||
145 | +4v42_1_BB_5-121 | ||
146 | +Could not find nucleotides of chain BB in annotation 4v42.json. Either there is a problem with 4v42 mmCIF download, or the bases are not resolved in the structure. Delete it and retry. | ||
147 | + | ||
148 | +4v42_1_BA_1-2914 | ||
149 | +Could not find nucleotides of chain BA in annotation 4v42.json. Either there is a problem with 4v42 mmCIF download, or the bases are not resolved in the structure. Delete it and retry. | ||
150 | + | ||
151 | +4v42_1_BA_151-2903 | ||
152 | +Could not find nucleotides of chain BA in annotation 4v42.json. Either there is a problem with 4v42 mmCIF download, or the bases are not resolved in the structure. Delete it and retry. | ||
153 | + | ||
154 | +2ob7_1_A_10-319 | ||
155 | +DSSR warning 2ob7.json: no nucleotides found. Ignoring 2ob7_1_A_10-319. | ||
156 | + | ||
157 | +1x1l_1_A_1-130 | ||
158 | +DSSR warning 1x1l.json: no nucleotides found. Ignoring 1x1l_1_A_1-130. | ||
159 | + | ||
160 | +1zc8_1_Z_1-130 | ||
161 | +DSSR warning 1zc8.json: no nucleotides found. Ignoring 1zc8_1_Z_1-130. | ||
162 | + | ||
163 | +1zc8_1_Z_1-91 | ||
164 | +DSSR warning 1zc8.json: no nucleotides found. Ignoring 1zc8_1_Z_1-91. | ||
165 | + | ||
166 | +2ob7_1_D_1-130 | ||
167 | +DSSR warning 2ob7.json: no nucleotides found. Ignoring 2ob7_1_D_1-130. | ||
168 | + | ||
169 | +1r2x_1_C_1-58 | ||
170 | +DSSR warning 1r2x.json: no nucleotides found. Ignoring 1r2x_1_C_1-58. | ||
171 | + | ||
172 | +1r2w_1_C_1-58 | ||
173 | +DSSR warning 1r2w.json: no nucleotides found. Ignoring 1r2w_1_C_1-58. | ||
174 | + | ||
175 | +1eg0_1_L_1-56 | ||
176 | +DSSR warning 1eg0.json: no nucleotides found. Ignoring 1eg0_1_L_1-56. | ||
177 | + | ||
178 | +1eg0_1_L_1-57 | ||
179 | +DSSR warning 1eg0.json: no nucleotides found. Ignoring 1eg0_1_L_1-57. | ||
180 | + | ||
181 | +6rxu_1_C2_588-2386 | ||
182 | +Could not find nucleotides of chain C2 in annotation 6rxu.json. Either there is a problem with 6rxu mmCIF download, or the bases are not resolved in the structure. Delete it and retry. | ||
183 | + | ||
184 | +6rxu_1_C2_588-2383 | ||
185 | +Could not find nucleotides of chain C2 in annotation 6rxu.json. Either there is a problem with 6rxu mmCIF download, or the bases are not resolved in the structure. Delete it and retry. | ||
186 | + | ||
187 | +6rxu_1_C2_583-2388 | ||
188 | +Could not find nucleotides of chain C2 in annotation 6rxu.json. Either there is a problem with 6rxu mmCIF download, or the bases are not resolved in the structure. Delete it and retry. | ||
189 | + | ||
190 | +5oql_1_2_588-2386 | ||
191 | +Could not find nucleotides of chain 2 in annotation 5oql.json. Either there is a problem with 5oql mmCIF download, or the bases are not resolved in the structure. Delete it and retry. | ||
192 | + | ||
193 | +5oql_1_2_588-2383 | ||
194 | +Could not find nucleotides of chain 2 in annotation 5oql.json. Either there is a problem with 5oql mmCIF download, or the bases are not resolved in the structure. Delete it and retry. | ||
195 | + | ||
196 | +5oql_1_2_583-2388 | ||
197 | +Could not find nucleotides of chain 2 in annotation 5oql.json. Either there is a problem with 5oql mmCIF download, or the bases are not resolved in the structure. Delete it and retry. | ||
198 | + | ||
199 | +6rxv_1_C2_588-2386 | ||
200 | +Could not find nucleotides of chain C2 in annotation 6rxv.json. Either there is a problem with 6rxv mmCIF download, or the bases are not resolved in the structure. Delete it and retry. | ||
201 | + | ||
202 | +6rxv_1_C2_588-2383 | ||
203 | +Could not find nucleotides of chain C2 in annotation 6rxv.json. Either there is a problem with 6rxv mmCIF download, or the bases are not resolved in the structure. Delete it and retry. | ||
204 | + | ||
205 | +6rxv_1_C2_583-2388 | ||
206 | +Could not find nucleotides of chain C2 in annotation 6rxv.json. Either there is a problem with 6rxv mmCIF download, or the bases are not resolved in the structure. Delete it and retry. | ||
207 | + | ||
208 | +6rxz_1_C2_588-2386 | ||
209 | +Could not find nucleotides of chain C2 in annotation 6rxz.json. Either there is a problem with 6rxz mmCIF download, or the bases are not resolved in the structure. Delete it and retry. | ||
210 | + | ||
211 | +6rxz_1_C2_588-2383 | ||
212 | +Could not find nucleotides of chain C2 in annotation 6rxz.json. Either there is a problem with 6rxz mmCIF download, or the bases are not resolved in the structure. Delete it and retry. | ||
213 | + | ||
214 | +6rxz_1_C2_583-2388 | ||
215 | +Could not find nucleotides of chain C2 in annotation 6rxz.json. Either there is a problem with 6rxz mmCIF download, or the bases are not resolved in the structure. Delete it and retry. | ||
216 | + | ||
217 | +6rxy_1_C2_588-2386 | ||
218 | +Could not find nucleotides of chain C2 in annotation 6rxy.json. Either there is a problem with 6rxy mmCIF download, or the bases are not resolved in the structure. Delete it and retry. | ||
219 | + | ||
220 | +6rxy_1_C2_588-2383 | ||
221 | +Could not find nucleotides of chain C2 in annotation 6rxy.json. Either there is a problem with 6rxy mmCIF download, or the bases are not resolved in the structure. Delete it and retry. | ||
222 | + | ||
223 | +6rxy_1_C2_583-2388 | ||
224 | +Could not find nucleotides of chain C2 in annotation 6rxy.json. Either there is a problem with 6rxy mmCIF download, or the bases are not resolved in the structure. Delete it and retry. | ||
225 | + | ||
226 | +6rxt_1_C2_588-2386 | ||
227 | +Could not find nucleotides of chain C2 in annotation 6rxt.json. Either there is a problem with 6rxt mmCIF download, or the bases are not resolved in the structure. Delete it and retry. | ||
228 | + | ||
229 | +6rxt_1_C2_588-2383 | ||
230 | +Could not find nucleotides of chain C2 in annotation 6rxt.json. Either there is a problem with 6rxt mmCIF download, or the bases are not resolved in the structure. Delete it and retry. | ||
231 | + | ||
232 | +6rxt_1_C2_583-2388 | ||
233 | +Could not find nucleotides of chain C2 in annotation 6rxt.json. Either there is a problem with 6rxt mmCIF download, or the bases are not resolved in the structure. Delete it and retry. | ||
234 | + | ||
235 | +4v48_1_BA_1-91 | ||
236 | +DSSR warning 4v48.json: no nucleotides found. Ignoring 4v48_1_BA_1-91. | ||
237 | + | ||
238 | +4v48_1_BA_6-1541 | ||
239 | +DSSR warning 4v48.json: no nucleotides found. Ignoring 4v48_1_BA_6-1541. | ||
240 | + | ||
241 | +4v48_1_BA_6-1538 | ||
242 | +DSSR warning 4v48.json: no nucleotides found. Ignoring 4v48_1_BA_6-1538. | ||
243 | + | ||
244 | +4v48_1_BA_1-1543 | ||
245 | +DSSR warning 4v48.json: no nucleotides found. Ignoring 4v48_1_BA_1-1543. | ||
246 | + | ||
247 | +4v47_1_BA_1-91 | ||
248 | +DSSR warning 4v47.json: no nucleotides found. Ignoring 4v47_1_BA_1-91. | ||
249 | + | ||
250 | +4v47_1_BA_6-1540 | ||
251 | +DSSR warning 4v47.json: no nucleotides found. Ignoring 4v47_1_BA_6-1540. | ||
252 | + | ||
253 | +4v47_1_BA_6-1537 | ||
254 | +DSSR warning 4v47.json: no nucleotides found. Ignoring 4v47_1_BA_6-1537. | ||
255 | + | ||
256 | +4v47_1_BA_1-1542 | ||
257 | +DSSR warning 4v47.json: no nucleotides found. Ignoring 4v47_1_BA_1-1542. | ||
258 | + | ||
259 | +2rdo_1_B_6-1460 | ||
260 | +DSSR warning 2rdo.json: no nucleotides found. Ignoring 2rdo_1_B_6-1460. | ||
261 | + | ||
262 | +2rdo_1_B_6-1522 | ||
263 | +DSSR warning 2rdo.json: no nucleotides found. Ignoring 2rdo_1_B_6-1522. | ||
264 | + | ||
265 | +2rdo_1_B_1-2903 | ||
266 | +DSSR warning 2rdo.json: no nucleotides found. Ignoring 2rdo_1_B_1-2903. | ||
267 | + | ||
268 | +2rdo_1_B_6-1457 | ||
269 | +DSSR warning 2rdo.json: no nucleotides found. Ignoring 2rdo_1_B_6-1457. | ||
270 | + | ||
271 | +2rdo_1_B_1-2904 | ||
272 | +DSSR warning 2rdo.json: no nucleotides found. Ignoring 2rdo_1_B_1-2904. | ||
273 | + | ||
274 | +2rdo_1_B_1-1528 | ||
275 | +DSSR warning 2rdo.json: no nucleotides found. Ignoring 2rdo_1_B_1-1528. | ||
276 | + | ||
277 | +2rdo_1_B_160-2893 | ||
278 | +DSSR warning 2rdo.json: no nucleotides found. Ignoring 2rdo_1_B_160-2893. | ||
279 | + | ||
280 | +4v48_1_A0_6-1460 | ||
281 | +DSSR warning 4v48.json: no nucleotides found. Ignoring 4v48_1_A0_6-1460. | ||
282 | + | ||
283 | +4v48_1_A0_6-1522 | ||
284 | +DSSR warning 4v48.json: no nucleotides found. Ignoring 4v48_1_A0_6-1522. | ||
285 | + | ||
286 | +4v48_1_A0_1-2903 | ||
287 | +DSSR warning 4v48.json: no nucleotides found. Ignoring 4v48_1_A0_1-2903. | ||
288 | + | ||
289 | +4v48_1_A0_6-1457 | ||
290 | +DSSR warning 4v48.json: no nucleotides found. Ignoring 4v48_1_A0_6-1457. | ||
291 | + | ||
292 | +4v48_1_A0_1-2904 | ||
293 | +DSSR warning 4v48.json: no nucleotides found. Ignoring 4v48_1_A0_1-2904. | ||
294 | + | ||
295 | +4v48_1_A0_1-1528 | ||
296 | +DSSR warning 4v48.json: no nucleotides found. Ignoring 4v48_1_A0_1-1528. | ||
297 | + | ||
298 | +4v48_1_A0_160-2893 | ||
299 | +DSSR warning 4v48.json: no nucleotides found. Ignoring 4v48_1_A0_160-2893. | ||
300 | + | ||
301 | +4v47_1_A0_6-1460 | ||
302 | +DSSR warning 4v47.json: no nucleotides found. Ignoring 4v47_1_A0_6-1460. | ||
303 | + | ||
304 | +4v47_1_A0_6-1522 | ||
305 | +DSSR warning 4v47.json: no nucleotides found. Ignoring 4v47_1_A0_6-1522. | ||
306 | + | ||
307 | +4v47_1_A0_1-2903 | ||
308 | +DSSR warning 4v47.json: no nucleotides found. Ignoring 4v47_1_A0_1-2903. | ||
309 | + | ||
310 | +4v47_1_A0_6-1457 | ||
311 | +DSSR warning 4v47.json: no nucleotides found. Ignoring 4v47_1_A0_6-1457. | ||
312 | + | ||
313 | +4v47_1_A0_1-2904 | ||
314 | +DSSR warning 4v47.json: no nucleotides found. Ignoring 4v47_1_A0_1-2904. | ||
315 | + | ||
316 | +4v47_1_A0_1-1528 | ||
317 | +DSSR warning 4v47.json: no nucleotides found. Ignoring 4v47_1_A0_1-1528. | ||
318 | + | ||
319 | +4v47_1_A0_160-2893 | ||
320 | +DSSR warning 4v47.json: no nucleotides found. Ignoring 4v47_1_A0_160-2893. | ||
321 | + | ||
322 | +1zc8_1_A_1-59 | ||
323 | +DSSR warning 1zc8.json: no nucleotides found. Ignoring 1zc8_1_A_1-59. | ||
324 | + | ||
325 | +1mvr_1_D_1-59 | ||
326 | +DSSR warning 1mvr.json: no nucleotides found. Ignoring 1mvr_1_D_1-59. | ||
327 | + | ||
328 | +4c9d_1_D_29-1 | ||
329 | +Mapping is reversed, this case is not supported (yet). | ||
330 | + | ||
331 | +4c9d_1_C_29-1 | ||
332 | +Mapping is reversed, this case is not supported (yet). | ||
333 | + | ||
334 | +4adx_1_9_1-121 | ||
335 | +DSSR warning 4adx.json: no nucleotides found. Ignoring 4adx_1_9_1-121. | ||
336 | + | ||
337 | +1zn1_1_B_1-59 | ||
338 | +DSSR warning 1zn1.json: no nucleotides found. Ignoring 1zn1_1_B_1-59. | ||
339 | + | ||
340 | +1emi_1_B_1-108 | ||
341 | +DSSR warning 1emi.json: no nucleotides found. Ignoring 1emi_1_B_1-108. | ||
342 | + | ||
343 | +3iy9_1_A_498-1027 | ||
344 | +DSSR warning 3iy9.json: no nucleotides found. Ignoring 3iy9_1_A_498-1027. | ||
345 | + | ||
346 | +1jgq_1_A_20-55 | ||
347 | +Could not find nucleotides of chain A in annotation 1jgq.json. Either there is a problem with 1jgq mmCIF download, or the bases are not resolved in the structure. Delete it and retry. | ||
348 | + | ||
349 | +1jgq_1_A_7-1518 | ||
350 | +Could not find nucleotides of chain A in annotation 1jgq.json. Either there is a problem with 1jgq mmCIF download, or the bases are not resolved in the structure. Delete it and retry. | ||
351 | + | ||
352 | +1jgq_1_A_7-1515 | ||
353 | +Could not find nucleotides of chain A in annotation 1jgq.json. Either there is a problem with 1jgq mmCIF download, or the bases are not resolved in the structure. Delete it and retry. | ||
354 | + | ||
355 | +1jgq_1_A_2-1520 | ||
356 | +Could not find nucleotides of chain A in annotation 1jgq.json. Either there is a problem with 1jgq mmCIF download, or the bases are not resolved in the structure. Delete it and retry. | ||
357 | + | ||
358 | +4v42_1_AA_20-55 | ||
359 | +Could not find nucleotides of chain AA in annotation 4v42.json. Either there is a problem with 4v42 mmCIF download, or the bases are not resolved in the structure. Delete it and retry. | ||
360 | + | ||
361 | +4v42_1_AA_7-1518 | ||
362 | +Could not find nucleotides of chain AA in annotation 4v42.json. Either there is a problem with 4v42 mmCIF download, or the bases are not resolved in the structure. Delete it and retry. | ||
363 | + | ||
364 | +4v42_1_AA_7-1515 | ||
365 | +Could not find nucleotides of chain AA in annotation 4v42.json. Either there is a problem with 4v42 mmCIF download, or the bases are not resolved in the structure. Delete it and retry. | ||
366 | + | ||
367 | +4v42_1_AA_2-1520 | ||
368 | +Could not find nucleotides of chain AA in annotation 4v42.json. Either there is a problem with 4v42 mmCIF download, or the bases are not resolved in the structure. Delete it and retry. | ||
369 | + | ||
370 | +1jgo_1_A_20-55 | ||
371 | +Could not find nucleotides of chain A in annotation 1jgo.json. Either there is a problem with 1jgo mmCIF download, or the bases are not resolved in the structure. Delete it and retry. | ||
372 | + | ||
373 | +1jgo_1_A_7-1518 | ||
374 | +Could not find nucleotides of chain A in annotation 1jgo.json. Either there is a problem with 1jgo mmCIF download, or the bases are not resolved in the structure. Delete it and retry. | ||
375 | + | ||
376 | +1jgo_1_A_7-1515 | ||
377 | +Could not find nucleotides of chain A in annotation 1jgo.json. Either there is a problem with 1jgo mmCIF download, or the bases are not resolved in the structure. Delete it and retry. | ||
378 | + | ||
379 | +1jgo_1_A_2-1520 | ||
380 | +Could not find nucleotides of chain A in annotation 1jgo.json. Either there is a problem with 1jgo mmCIF download, or the bases are not resolved in the structure. Delete it and retry. | ||
381 | + | ||
382 | +1jgp_1_A_20-55 | ||
383 | +Could not find nucleotides of chain A in annotation 1jgp.json. Either there is a problem with 1jgp mmCIF download, or the bases are not resolved in the structure. Delete it and retry. | ||
384 | + | ||
385 | +1jgp_1_A_7-1518 | ||
386 | +Could not find nucleotides of chain A in annotation 1jgp.json. Either there is a problem with 1jgp mmCIF download, or the bases are not resolved in the structure. Delete it and retry. | ||
387 | + | ||
388 | +1jgp_1_A_7-1515 | ||
389 | +Could not find nucleotides of chain A in annotation 1jgp.json. Either there is a problem with 1jgp mmCIF download, or the bases are not resolved in the structure. Delete it and retry. | ||
390 | + | ||
391 | +1jgp_1_A_2-1520 | ||
392 | +Could not find nucleotides of chain A in annotation 1jgp.json. Either there is a problem with 1jgp mmCIF download, or the bases are not resolved in the structure. Delete it and retry. | ||
393 | + | ||
394 | +3ep2_1_B_1-50 | ||
395 | +DSSR warning 3ep2.json: no nucleotides found. Ignoring 3ep2_1_B_1-50. | ||
396 | + | ||
397 | +3eq3_1_B_1-50 | ||
398 | +DSSR warning 3eq3.json: no nucleotides found. Ignoring 3eq3_1_B_1-50. | ||
399 | + | ||
400 | +3eq4_1_B_1-50 | ||
401 | +DSSR warning 3eq4.json: no nucleotides found. Ignoring 3eq4_1_B_1-50. | ||
402 | + | ||
403 | +3pgw_1_R_1-164 | ||
404 | +DSSR warning 3pgw.json: no nucleotides found. Ignoring 3pgw_1_R_1-164. | ||
405 | + | ||
406 | +3pgw_1_N_1-164 | ||
407 | +DSSR warning 3pgw.json: no nucleotides found. Ignoring 3pgw_1_N_1-164. | ||
408 | + | ||
409 | +3cw1_1_x_1-138 | ||
410 | +DSSR warning 3cw1.json: no nucleotides found. Ignoring 3cw1_1_x_1-138. | ||
411 | + | ||
412 | +3cw1_1_w_1-138 | ||
413 | +DSSR warning 3cw1.json: no nucleotides found. Ignoring 3cw1_1_w_1-138. | ||
414 | + | ||
415 | +3cw1_1_V_1-138 | ||
416 | +DSSR warning 3cw1.json: no nucleotides found. Ignoring 3cw1_1_V_1-138. | ||
417 | + | ||
418 | +3cw1_1_v_1-138 | ||
419 | +DSSR warning 3cw1.json: no nucleotides found. Ignoring 3cw1_1_v_1-138. | ||
420 | + | ||
421 | +2iy3_1_B_9-105 | ||
422 | +DSSR warning 2iy3.json: no nucleotides found. Ignoring 2iy3_1_B_9-105. | ||
423 | + | ||
424 | +3jcr_1_N_1-106 | ||
425 | +DSSR warning 3jcr.json: no nucleotides found. Ignoring 3jcr_1_N_1-106. | ||
426 | + | ||
427 | +3jcr_1_N_1-188 | ||
428 | +DSSR warning 3jcr.json: no nucleotides found. Ignoring 3jcr_1_N_1-188. | ||
429 | + | ||
430 | +2vaz_1_A_64-177 | ||
431 | +DSSR warning 2vaz.json: no nucleotides found. Ignoring 2vaz_1_A_64-177. | ||
432 | + | ||
433 | +2ftc_1_R_81-1466 | ||
434 | +DSSR warning 2ftc.json: no nucleotides found. Ignoring 2ftc_1_R_81-1466. | ||
435 | + | ||
436 | +2ftc_1_R_1-1568 | ||
437 | +DSSR warning 2ftc.json: no nucleotides found. Ignoring 2ftc_1_R_1-1568. | ||
438 | + | ||
439 | +2ftc_1_R_792-1568 | ||
440 | +DSSR warning 2ftc.json: no nucleotides found. Ignoring 2ftc_1_R_792-1568. | ||
441 | + | ||
442 | +3jcr_1_M_1-141 | ||
443 | +DSSR warning 3jcr.json: no nucleotides found. Ignoring 3jcr_1_M_1-141. | ||
444 | + | ||
445 | +3jcr_1_M_1-107 | ||
446 | +DSSR warning 3jcr.json: no nucleotides found. Ignoring 3jcr_1_M_1-107. | ||
447 | + | ||
448 | +3jcr_1_M_1-188 | ||
449 | +DSSR warning 3jcr.json: no nucleotides found. Ignoring 3jcr_1_M_1-188. | ||
450 | + | ||
451 | +4v5z_1_B0_1-2840 | ||
452 | +DSSR warning 4v5z.json: no nucleotides found. Ignoring 4v5z_1_B0_1-2840. | ||
453 | + | ||
454 | +4v5z_1_B0_1-2899 | ||
455 | +DSSR warning 4v5z.json: no nucleotides found. Ignoring 4v5z_1_B0_1-2899. | ||
456 | + | ||
457 | +4v5z_1_B0_1-2902 | ||
458 | +DSSR warning 4v5z.json: no nucleotides found. Ignoring 4v5z_1_B0_1-2902. | ||
459 | + | ||
460 | +5g2x_1_A_595-692 | ||
461 | +Sequence is too short. (< 5 resolved nts) | ||
462 | + | ||
463 | +3iy8_1_A_1-540 | ||
464 | +DSSR warning 3iy8.json: no nucleotides found. Ignoring 3iy8_1_A_1-540. | ||
465 | + | ||
466 | +4v5z_1_BY_2-113 | ||
467 | +DSSR warning 4v5z.json: no nucleotides found. Ignoring 4v5z_1_BY_2-113. | ||
468 | + | ||
469 | +4v5z_1_BZ_1-70 | ||
470 | +DSSR warning 4v5z.json: no nucleotides found. Ignoring 4v5z_1_BZ_1-70. | ||
471 | + | ||
472 | +1mvr_1_B_1-96 | ||
473 | +DSSR warning 1mvr.json: no nucleotides found. Ignoring 1mvr_1_B_1-96. | ||
474 | + | ||
475 | +4adx_1_0_1-2923 | ||
476 | +DSSR warning 4adx.json: no nucleotides found. Ignoring 4adx_1_0_1-2923. | ||
477 | + | ||
478 | +4adx_1_0_132-2915 | ||
479 | +DSSR warning 4adx.json: no nucleotides found. Ignoring 4adx_1_0_132-2915. | ||
480 | + | ||
481 | +3eq4_1_Y_1-69 | ||
482 | +DSSR warning 3eq4.json: no nucleotides found. Ignoring 3eq4_1_Y_1-69. | ||
483 | + | ||
484 | +4v5z_1_AA_1-1562 | ||
485 | +DSSR warning 4v5z.json: no nucleotides found. Ignoring 4v5z_1_AA_1-1562. | ||
486 | + | ||
487 | +4v5z_1_AA_1-1563 | ||
488 | +DSSR warning 4v5z.json: no nucleotides found. Ignoring 4v5z_1_AA_1-1563. | ||
489 | + | ||
490 | +6lqm_1_8_1267-4755 | ||
491 | +Could not find nucleotides of chain 8 in annotation 6lqm.json. Either there is a problem with 6lqm mmCIF download, or the bases are not resolved in the structure. Delete it and retry. | ||
492 | + | ||
493 | +6lu8_1_8_1267-4755 | ||
494 | +Could not find nucleotides of chain 8 in annotation 6lu8.json. Either there is a problem with 6lu8 mmCIF download, or the bases are not resolved in the structure. Delete it and retry. | ||
495 | + | ||
496 | +6lsr_1_8_1267-4755 | ||
497 | +Could not find nucleotides of chain 8 in annotation 6lsr.json. Either there is a problem with 6lsr mmCIF download, or the bases are not resolved in the structure. Delete it and retry. | ||
498 | + | ||
499 | +6lss_1_8_1267-4755 | ||
500 | +Could not find nucleotides of chain 8 in annotation 6lss.json. Either there is a problem with 6lss mmCIF download, or the bases are not resolved in the structure. Delete it and retry. | ||
501 | + |
... | @@ -5,7 +5,7 @@ | ... | @@ -5,7 +5,7 @@ |
5 | # in the database. | 5 | # in the database. |
6 | # This should be run from the folder where the file is (to access the database with path "results/RNANet.db") | 6 | # This should be run from the folder where the file is (to access the database with path "results/RNANet.db") |
7 | 7 | ||
8 | -import os, pickle, sqlite3, sys | 8 | +import os, pickle, sqlite3, shlex, subprocess, sys |
9 | import numpy as np | 9 | import numpy as np |
10 | import pandas as pd | 10 | import pandas as pd |
11 | import threading as th | 11 | import threading as th |
... | @@ -16,14 +16,13 @@ import matplotlib.patches as mpatches | ... | @@ -16,14 +16,13 @@ import matplotlib.patches as mpatches |
16 | import scipy.cluster.hierarchy as sch | 16 | import scipy.cluster.hierarchy as sch |
17 | from scipy.spatial.distance import squareform | 17 | from scipy.spatial.distance import squareform |
18 | from mpl_toolkits.mplot3d import axes3d | 18 | from mpl_toolkits.mplot3d import axes3d |
19 | -from Bio.Phylo.TreeConstruction import DistanceCalculator | ||
20 | from Bio import AlignIO, SeqIO | 19 | from Bio import AlignIO, SeqIO |
21 | from functools import partial | 20 | from functools import partial |
22 | -from multiprocessing import Pool | 21 | +from multiprocessing import Pool, Manager |
23 | from os import path | 22 | from os import path |
24 | from tqdm import tqdm | 23 | from tqdm import tqdm |
25 | from collections import Counter | 24 | from collections import Counter |
26 | -from RNAnet import read_cpu_number, sql_ask_database, sql_execute, warn, notify, init_worker | 25 | +from RNAnet import Job, read_cpu_number, sql_ask_database, sql_execute, warn, notify, init_worker |
27 | 26 | ||
28 | # This sets the paths | 27 | # This sets the paths |
29 | if len(sys.argv) > 1: | 28 | if len(sys.argv) > 1: |
... | @@ -37,7 +36,7 @@ else: | ... | @@ -37,7 +36,7 @@ else: |
37 | LSU_set = ("RF00002", "RF02540", "RF02541", "RF02543", "RF02546") # From Rfam CLAN 00112 | 36 | LSU_set = ("RF00002", "RF02540", "RF02541", "RF02543", "RF02546") # From Rfam CLAN 00112 |
38 | SSU_set = ("RF00177", "RF02542", "RF02545", "RF01959", "RF01960") # From Rfam CLAN 00111 | 37 | SSU_set = ("RF00177", "RF02542", "RF02545", "RF01959", "RF01960") # From Rfam CLAN 00111 |
39 | 38 | ||
40 | -def reproduce_wadley_results(show=False, carbon=4, sd_range=(1,4)): | 39 | +def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4)): |
41 | """ | 40 | """ |
42 | Plot the joint distribution of pseudotorsion angles, in a Ramachandran-style graph. | 41 | Plot the joint distribution of pseudotorsion angles, in a Ramachandran-style graph. |
43 | See Wadley & Pyle (2007) | 42 | See Wadley & Pyle (2007) |
... | @@ -68,6 +67,12 @@ def reproduce_wadley_results(show=False, carbon=4, sd_range=(1,4)): | ... | @@ -68,6 +67,12 @@ def reproduce_wadley_results(show=False, carbon=4, sd_range=(1,4)): |
68 | 67 | ||
69 | 68 | ||
70 | if not path.isfile(f"data/wadley_kernel_{angle}.npz"): | 69 | if not path.isfile(f"data/wadley_kernel_{angle}.npz"): |
70 | + | ||
71 | + # Get a worker number to position the progress bar | ||
72 | + global idxQueue | ||
73 | + thr_idx = idxQueue.get() | ||
74 | + pbar = tqdm(total=2, desc=f"Worker {thr_idx+1}: eta/theta C{carbon} kernels", position=thr_idx+1, leave=False) | ||
75 | + | ||
71 | # Extract the angle values of c2'-endo and c3'-endo nucleotides | 76 | # Extract the angle values of c2'-endo and c3'-endo nucleotides |
72 | with sqlite3.connect("results/RNANet.db") as conn: | 77 | with sqlite3.connect("results/RNANet.db") as conn: |
73 | df = pd.read_sql(f"""SELECT {angle}, th{angle} FROM nucleotide WHERE puckering="C2'-endo" AND {angle} IS NOT NULL AND th{angle} IS NOT NULL;""", conn) | 78 | df = pd.read_sql(f"""SELECT {angle}, th{angle} FROM nucleotide WHERE puckering="C2'-endo" AND {angle} IS NOT NULL AND th{angle} IS NOT NULL;""", conn) |
... | @@ -89,13 +94,17 @@ def reproduce_wadley_results(show=False, carbon=4, sd_range=(1,4)): | ... | @@ -89,13 +94,17 @@ def reproduce_wadley_results(show=False, carbon=4, sd_range=(1,4)): |
89 | xx, yy = np.mgrid[0:2*np.pi:100j, 0:2*np.pi:100j] | 94 | xx, yy = np.mgrid[0:2*np.pi:100j, 0:2*np.pi:100j] |
90 | positions = np.vstack([xx.ravel(), yy.ravel()]) | 95 | positions = np.vstack([xx.ravel(), yy.ravel()]) |
91 | f_c3 = np.reshape(kernel_c3(positions).T, xx.shape) | 96 | f_c3 = np.reshape(kernel_c3(positions).T, xx.shape) |
97 | + pbar.update(1) | ||
92 | f_c2 = np.reshape(kernel_c2(positions).T, xx.shape) | 98 | f_c2 = np.reshape(kernel_c2(positions).T, xx.shape) |
99 | + pbar.update(1) | ||
93 | 100 | ||
94 | # Save the data to an archive for later use without the need to recompute | 101 | # Save the data to an archive for later use without the need to recompute |
95 | np.savez(f"data/wadley_kernel_{angle}.npz", | 102 | np.savez(f"data/wadley_kernel_{angle}.npz", |
96 | c3_endo_e=c3_endo_etas, c3_endo_t=c3_endo_thetas, | 103 | c3_endo_e=c3_endo_etas, c3_endo_t=c3_endo_thetas, |
97 | c2_endo_e=c2_endo_etas, c2_endo_t=c2_endo_thetas, | 104 | c2_endo_e=c2_endo_etas, c2_endo_t=c2_endo_thetas, |
98 | kernel_c3=f_c3, kernel_c2=f_c2) | 105 | kernel_c3=f_c3, kernel_c2=f_c2) |
106 | + pbar.close() | ||
107 | + idxQueue.put(thr_idx) | ||
99 | else: | 108 | else: |
100 | f = np.load(f"data/wadley_kernel_{angle}.npz") | 109 | f = np.load(f"data/wadley_kernel_{angle}.npz") |
101 | c2_endo_etas = f["c2_endo_e"] | 110 | c2_endo_etas = f["c2_endo_e"] |
... | @@ -106,7 +115,7 @@ def reproduce_wadley_results(show=False, carbon=4, sd_range=(1,4)): | ... | @@ -106,7 +115,7 @@ def reproduce_wadley_results(show=False, carbon=4, sd_range=(1,4)): |
106 | f_c2 = f["kernel_c2"] | 115 | f_c2 = f["kernel_c2"] |
107 | xx, yy = np.mgrid[0:2*np.pi:100j, 0:2*np.pi:100j] | 116 | xx, yy = np.mgrid[0:2*np.pi:100j, 0:2*np.pi:100j] |
108 | 117 | ||
109 | - notify(f"Kernel computed for {angle}/th{angle} (or loaded from file).") | 118 | + # notify(f"Kernel computed for {angle}/th{angle} (or loaded from file).") |
110 | 119 | ||
111 | # exact counts: | 120 | # exact counts: |
112 | hist_c2, xedges, yedges = np.histogram2d(c2_endo_etas, c2_endo_thetas, bins=int(2*np.pi/0.1), | 121 | hist_c2, xedges, yedges = np.histogram2d(c2_endo_etas, c2_endo_thetas, bins=int(2*np.pi/0.1), |
... | @@ -139,7 +148,7 @@ def reproduce_wadley_results(show=False, carbon=4, sd_range=(1,4)): | ... | @@ -139,7 +148,7 @@ def reproduce_wadley_results(show=False, carbon=4, sd_range=(1,4)): |
139 | fig.savefig(f"results/figures/wadley_plots/wadley_hist_{angle}_{l}.png") | 148 | fig.savefig(f"results/figures/wadley_plots/wadley_hist_{angle}_{l}.png") |
140 | if show: | 149 | if show: |
141 | fig.show() | 150 | fig.show() |
142 | - fig.close() | 151 | + plt.close() |
143 | 152 | ||
144 | # Smoothed joint distribution | 153 | # Smoothed joint distribution |
145 | fig = plt.figure() | 154 | fig = plt.figure() |
... | @@ -150,7 +159,7 @@ def reproduce_wadley_results(show=False, carbon=4, sd_range=(1,4)): | ... | @@ -150,7 +159,7 @@ def reproduce_wadley_results(show=False, carbon=4, sd_range=(1,4)): |
150 | fig.savefig(f"results/figures/wadley_plots/wadley_distrib_{angle}_{l}.png") | 159 | fig.savefig(f"results/figures/wadley_plots/wadley_distrib_{angle}_{l}.png") |
151 | if show: | 160 | if show: |
152 | fig.show() | 161 | fig.show() |
153 | - fig.close() | 162 | + plt.close() |
154 | 163 | ||
155 | # 2D Wadley plot | 164 | # 2D Wadley plot |
156 | fig = plt.figure(figsize=(5,5)) | 165 | fig = plt.figure(figsize=(5,5)) |
... | @@ -163,7 +172,7 @@ def reproduce_wadley_results(show=False, carbon=4, sd_range=(1,4)): | ... | @@ -163,7 +172,7 @@ def reproduce_wadley_results(show=False, carbon=4, sd_range=(1,4)): |
163 | fig.savefig(f"results/figures/wadley_plots/wadley_{angle}_{l}.png") | 172 | fig.savefig(f"results/figures/wadley_plots/wadley_{angle}_{l}.png") |
164 | if show: | 173 | if show: |
165 | fig.show() | 174 | fig.show() |
166 | - fig.close() | 175 | + plt.close() |
167 | # print(f"[{worker_nbr}]\tComputed joint distribution of angles (C{carbon}) and saved the figures.") | 176 | # print(f"[{worker_nbr}]\tComputed joint distribution of angles (C{carbon}) and saved the figures.") |
168 | 177 | ||
169 | def stats_len(): | 178 | def stats_len(): |
... | @@ -171,11 +180,15 @@ def stats_len(): | ... | @@ -171,11 +180,15 @@ def stats_len(): |
171 | 180 | ||
172 | REQUIRES tables chain, nucleotide up to date. | 181 | REQUIRES tables chain, nucleotide up to date. |
173 | """ | 182 | """ |
183 | + | ||
184 | + # Get a worker number to position the progress bar | ||
185 | + global idxQueue | ||
186 | + thr_idx = idxQueue.get() | ||
174 | 187 | ||
175 | cols = [] | 188 | cols = [] |
176 | lengths = [] | 189 | lengths = [] |
177 | - conn = sqlite3.connect("results/RNANet.db") | 190 | + |
178 | - for i,f in enumerate(fam_list): | 191 | + for i,f in enumerate(tqdm(fam_list, position=thr_idx+1, desc=f"Worker {thr_idx+1}: Average chain lengths", leave=False)): |
179 | 192 | ||
180 | # Define a color for that family in the plot | 193 | # Define a color for that family in the plot |
181 | if f in LSU_set: | 194 | if f in LSU_set: |
... | @@ -190,11 +203,11 @@ def stats_len(): | ... | @@ -190,11 +203,11 @@ def stats_len(): |
190 | cols.append("grey") | 203 | cols.append("grey") |
191 | 204 | ||
192 | # Get the lengths of chains | 205 | # Get the lengths of chains |
193 | - l = [ x[0] for x in sql_ask_database(conn, f"SELECT COUNT(index_chain) FROM (SELECT chain_id FROM chain WHERE rfam_acc='{f}') NATURAL JOIN nucleotide GROUP BY chain_id;") ] | 206 | + with sqlite3.connect("results/RNANet.db") as conn: |
207 | + l = [ x[0] for x in sql_ask_database(conn, f"SELECT COUNT(index_chain) FROM (SELECT chain_id FROM chain WHERE rfam_acc='{f}') NATURAL JOIN nucleotide GROUP BY chain_id;", warn_every=0) ] | ||
194 | lengths.append(l) | 208 | lengths.append(l) |
195 | 209 | ||
196 | - notify(f"[{i+1}/{len(fam_list)}] Computed {f} chains lengths") | 210 | + # notify(f"[{i+1}/{len(fam_list)}] Computed {f} chains lengths") |
197 | - conn.close() | ||
198 | 211 | ||
199 | # Plot the figure | 212 | # Plot the figure |
200 | fig = plt.figure(figsize=(10,3)) | 213 | fig = plt.figure(figsize=(10,3)) |
... | @@ -223,7 +236,8 @@ def stats_len(): | ... | @@ -223,7 +236,8 @@ def stats_len(): |
223 | 236 | ||
224 | # Save the figure | 237 | # Save the figure |
225 | fig.savefig("results/figures/lengths.png") | 238 | fig.savefig("results/figures/lengths.png") |
226 | - notify("Computed sequence length statistics and saved the figure.") | 239 | + idxQueue.put(thr_idx) # replace the thread index in the queue |
240 | + # notify("Computed sequence length statistics and saved the figure.") | ||
227 | 241 | ||
228 | def format_percentage(tot, x): | 242 | def format_percentage(tot, x): |
229 | if not tot: | 243 | if not tot: |
... | @@ -242,40 +256,57 @@ def stats_freq(): | ... | @@ -242,40 +256,57 @@ def stats_freq(): |
242 | 256 | ||
243 | Outputs results/frequencies.csv | 257 | Outputs results/frequencies.csv |
244 | REQUIRES tables chain, nucleotide up to date.""" | 258 | REQUIRES tables chain, nucleotide up to date.""" |
259 | + | ||
260 | + # Get a worker number to position the progress bar | ||
261 | + global idxQueue | ||
262 | + thr_idx = idxQueue.get() | ||
263 | + | ||
245 | # Initialize a Counter object for each family | 264 | # Initialize a Counter object for each family |
246 | freqs = {} | 265 | freqs = {} |
247 | for f in fam_list: | 266 | for f in fam_list: |
248 | freqs[f] = Counter() | 267 | freqs[f] = Counter() |
249 | 268 | ||
250 | # List all nt_names happening within a RNA family and store the counts in the Counter | 269 | # List all nt_names happening within a RNA family and store the counts in the Counter |
251 | - conn = sqlite3.connect("results/RNANet.db") | 270 | + for i,f in enumerate(tqdm(fam_list, position=thr_idx+1, desc=f"Worker {thr_idx+1}: Base frequencies", leave=False)): |
252 | - for i,f in enumerate(fam_list): | 271 | + with sqlite3.connect("results/RNANet.db") as conn: |
253 | - counts = dict(sql_ask_database(conn, f"SELECT nt_name, COUNT(nt_name) FROM (SELECT chain_id from chain WHERE rfam_acc='{f}') NATURAL JOIN nucleotide GROUP BY nt_name;")) | 272 | + counts = dict(sql_ask_database(conn, f"SELECT nt_name, COUNT(nt_name) FROM (SELECT chain_id from chain WHERE rfam_acc='{f}') NATURAL JOIN nucleotide GROUP BY nt_name;", warn_every=0)) |
254 | freqs[f].update(counts) | 273 | freqs[f].update(counts) |
255 | - notify(f"[{i+1}/{len(fam_list)}] Computed {f} nucleotide frequencies.") | 274 | + # notify(f"[{i+1}/{len(fam_list)}] Computed {f} nucleotide frequencies.") |
256 | - conn.close() | ||
257 | 275 | ||
258 | # Create a pandas DataFrame, and save it to CSV. | 276 | # Create a pandas DataFrame, and save it to CSV. |
259 | df = pd.DataFrame() | 277 | df = pd.DataFrame() |
260 | - for f in fam_list: | 278 | + for f in tqdm(fam_list, position=thr_idx+1, desc=f"Worker {thr_idx+1}: Base frequencies", leave=False): |
261 | tot = sum(freqs[f].values()) | 279 | tot = sum(freqs[f].values()) |
262 | df = pd.concat([ df, pd.DataFrame([[ format_percentage(tot, x) for x in freqs[f].values() ]], columns=list(freqs[f]), index=[f]) ]) | 280 | df = pd.concat([ df, pd.DataFrame([[ format_percentage(tot, x) for x in freqs[f].values() ]], columns=list(freqs[f]), index=[f]) ]) |
263 | df = df.fillna(0) | 281 | df = df.fillna(0) |
264 | df.to_csv("results/frequencies.csv") | 282 | df.to_csv("results/frequencies.csv") |
265 | - notify("Saved nucleotide frequencies to CSV file.") | 283 | + idxQueue.put(thr_idx) # replace the thread index in the queue |
284 | + # notify("Saved nucleotide frequencies to CSV file.") | ||
266 | 285 | ||
267 | def parallel_stats_pairs(f): | 286 | def parallel_stats_pairs(f): |
268 | """Counts occurrences of intra-chain base-pair types in one RNA family | 287 | """Counts occurrences of intra-chain base-pair types in one RNA family |
269 | 288 | ||
270 | REQUIRES tables chain, nucleotide up-to-date.""" | 289 | REQUIRES tables chain, nucleotide up-to-date.""" |
271 | 290 | ||
291 | + if path.isfile("data/"+f+"_pairs.csv") and path.isfile("data/"+f+"_counts.csv"): | ||
292 | + return | ||
293 | + | ||
294 | + # Get a worker number to position the progress bar | ||
295 | + global idxQueue | ||
296 | + thr_idx = idxQueue.get() | ||
297 | + | ||
272 | chain_id_list = mappings_list[f] | 298 | chain_id_list = mappings_list[f] |
273 | data = [] | 299 | data = [] |
274 | - for cid in chain_id_list: | 300 | + sqldata = [] |
301 | + for cid in tqdm(chain_id_list, position=thr_idx+1, desc=f"Worker {thr_idx+1}: {f} basepair types", leave=False): | ||
275 | with sqlite3.connect("results/RNANet.db") as conn: | 302 | with sqlite3.connect("results/RNANet.db") as conn: |
276 | # Get comma separated lists of basepairs per nucleotide | 303 | # Get comma separated lists of basepairs per nucleotide |
277 | - interactions = pd.read_sql(f"SELECT nt_code as nt1, index_chain, paired, pair_type_LW FROM (SELECT chain_id FROM chain WHERE chain_id='{cid}') NATURAL JOIN nucleotide;", conn) | 304 | + interactions = pd.DataFrame( |
278 | - | 305 | + sql_ask_database(conn, |
306 | + f"SELECT nt_code as nt1, index_chain, paired, pair_type_LW FROM (SELECT chain_id FROM chain WHERE chain_id='{cid}') NATURAL JOIN nucleotide;", | ||
307 | + warn_every=0), | ||
308 | + columns = ["nt1", "index_chain", "paired", "pair_type_LW"] | ||
309 | + ) | ||
279 | # expand the comma-separated lists in real lists | 310 | # expand the comma-separated lists in real lists |
280 | expanded_list = pd.concat([ pd.DataFrame({ 'nt1':[ row["nt1"] for x in row["paired"].split(',') ], | 311 | expanded_list = pd.concat([ pd.DataFrame({ 'nt1':[ row["nt1"] for x in row["paired"].split(',') ], |
281 | 'index_chain':[ row['index_chain'] for x in row["paired"].split(',') ], | 312 | 'index_chain':[ row['index_chain'] for x in row["paired"].split(',') ], |
... | @@ -317,27 +348,29 @@ def parallel_stats_pairs(f): | ... | @@ -317,27 +348,29 @@ def parallel_stats_pairs(f): |
317 | 348 | ||
318 | # Update the database | 349 | # Update the database |
319 | vlcnts = expanded_list.pair_type_LW.value_counts() | 350 | vlcnts = expanded_list.pair_type_LW.value_counts() |
320 | - sqldata = ( vlcnts.at["cWW"]/2 if "cWW" in vlcnts.index else 0, | 351 | + sqldata.append( ( vlcnts.at["cWW"]/2 if "cWW" in vlcnts.index else 0, |
321 | - vlcnts.at["cWH"] if "cWH" in vlcnts.index else 0, | 352 | + vlcnts.at["cWH"] if "cWH" in vlcnts.index else 0, |
322 | - vlcnts.at["cWS"] if "cWS" in vlcnts.index else 0, | 353 | + vlcnts.at["cWS"] if "cWS" in vlcnts.index else 0, |
323 | - vlcnts.at["cHH"]/2 if "cHH" in vlcnts.index else 0, | 354 | + vlcnts.at["cHH"]/2 if "cHH" in vlcnts.index else 0, |
324 | - vlcnts.at["cHS"] if "cHS" in vlcnts.index else 0, | 355 | + vlcnts.at["cHS"] if "cHS" in vlcnts.index else 0, |
325 | - vlcnts.at["cSS"]/2 if "cSS" in vlcnts.index else 0, | 356 | + vlcnts.at["cSS"]/2 if "cSS" in vlcnts.index else 0, |
326 | - vlcnts.at["tWW"]/2 if "tWW" in vlcnts.index else 0, | 357 | + vlcnts.at["tWW"]/2 if "tWW" in vlcnts.index else 0, |
327 | - vlcnts.at["tWH"] if "tWH" in vlcnts.index else 0, | 358 | + vlcnts.at["tWH"] if "tWH" in vlcnts.index else 0, |
328 | - vlcnts.at["tWS"] if "tWS" in vlcnts.index else 0, | 359 | + vlcnts.at["tWS"] if "tWS" in vlcnts.index else 0, |
329 | - vlcnts.at["tHH"]/2 if "tHH" in vlcnts.index else 0, | 360 | + vlcnts.at["tHH"]/2 if "tHH" in vlcnts.index else 0, |
330 | - vlcnts.at["tHS"] if "tHS" in vlcnts.index else 0, | 361 | + vlcnts.at["tHS"] if "tHS" in vlcnts.index else 0, |
331 | - vlcnts.at["tSS"]/2 if "tSS" in vlcnts.index else 0, | 362 | + vlcnts.at["tSS"]/2 if "tSS" in vlcnts.index else 0, |
332 | - int(sum(vlcnts.loc[[ str(x) for x in vlcnts.index if "." in str(x)]])/2), | 363 | + int(sum(vlcnts.loc[[ str(x) for x in vlcnts.index if "." in str(x)]])/2), |
333 | - cid) | 364 | + cid) ) |
334 | - with sqlite3.connect("results/RNANet.db") as conn: | ||
335 | - sql_execute(conn, """UPDATE chain SET pair_count_cWW = ?, pair_count_cWH = ?, pair_count_cWS = ?, pair_count_cHH = ?, | ||
336 | - pair_count_cHS = ?, pair_count_cSS = ?, pair_count_tWW = ?, pair_count_tWH = ?, pair_count_tWS = ?, | ||
337 | - pair_count_tHH = ?, pair_count_tHS = ?, pair_count_tSS = ?, pair_count_other = ? WHERE chain_id = ?;""", data=sqldata) | ||
338 | 365 | ||
339 | data.append(expanded_list) | 366 | data.append(expanded_list) |
340 | 367 | ||
368 | + # Update the database | ||
369 | + with sqlite3.connect("results/RNANet.db", isolation_level=None) as conn: | ||
370 | + conn.execute('pragma journal_mode=wal') # Allow multiple other readers to ask things while we execute this writing query | ||
371 | + sql_execute(conn, """UPDATE chain SET pair_count_cWW = ?, pair_count_cWH = ?, pair_count_cWS = ?, pair_count_cHH = ?, | ||
372 | + pair_count_cHS = ?, pair_count_cSS = ?, pair_count_tWW = ?, pair_count_tWH = ?, pair_count_tWS = ?, | ||
373 | + pair_count_tHH = ?, pair_count_tHS = ?, pair_count_tSS = ?, pair_count_other = ? WHERE chain_id = ?;""", many=True, data=sqldata, warn_every=0) | ||
341 | 374 | ||
342 | # merge all the dataframes from all chains of the family | 375 | # merge all the dataframes from all chains of the family |
343 | expanded_list = pd.concat(data) | 376 | expanded_list = pd.concat(data) |
... | @@ -351,7 +384,106 @@ def parallel_stats_pairs(f): | ... | @@ -351,7 +384,106 @@ def parallel_stats_pairs(f): |
351 | 384 | ||
352 | # Create an output DataFrame | 385 | # Create an output DataFrame |
353 | f_df = pd.DataFrame([[ x for x in cnt.values() ]], columns=list(cnt), index=[f]) | 386 | f_df = pd.DataFrame([[ x for x in cnt.values() ]], columns=list(cnt), index=[f]) |
354 | - return expanded_list, f_df | 387 | + f_df.to_csv(f"data/{f}_counts.csv") |
388 | + expanded_list.to_csv(f"data/{f}_pairs.csv") | ||
389 | + | ||
390 | + idxQueue.put(thr_idx) # replace the thread index in the queue | ||
391 | + | ||
392 | +def to_dist_matrix(f): | ||
393 | + if path.isfile("data/"+f+".npy"): | ||
394 | + # notify(f"Computed {f} distance matrix", "loaded from file") | ||
395 | + return 0 | ||
396 | + | ||
397 | + # Get a worker number to position the progress bar | ||
398 | + global idxQueue | ||
399 | + thr_idx = idxQueue.get() | ||
400 | + | ||
401 | + # notify(f"Computing {f} distance matrix from alignment...") | ||
402 | + command = f"esl-alipid --rna --noheader --informat stockholm {f}_3d_only.stk" | ||
403 | + | ||
404 | + # Prepare a file | ||
405 | + with open(path_to_seq_data+f"/realigned/{f}++.afa") as al_file: | ||
406 | + al = AlignIO.read(al_file, "fasta") | ||
407 | + names = [ x.id for x in al if '[' in x.id ] | ||
408 | + al = al[-len(names):] | ||
409 | + with open(f + "_3d_only.stk", "w") as only_3d: | ||
410 | + only_3d.write(al.format("stockholm")) | ||
411 | + del al | ||
412 | + | ||
413 | + # Prepare the job | ||
414 | + process = subprocess.Popen(shlex.split(command), stdout=subprocess.PIPE) | ||
415 | + id_matrix = np.zeros((len(names), len(names))) | ||
416 | + | ||
417 | + pbar = tqdm(total = len(names)*(len(names)-1)*0.5, position=thr_idx+1, desc=f"Worker {thr_idx+1}: {f} idty matrix", leave=False) | ||
418 | + while process.poll() is None: | ||
419 | + output = process.stdout.readline() | ||
420 | + if output: | ||
421 | + lines = output.strip().split(b'\n') | ||
422 | + for l in lines: | ||
423 | + line = l.split() | ||
424 | + s1 = line[0].decode('utf-8') | ||
425 | + s2 = line[1].decode('utf-8') | ||
426 | + score = line[2].decode('utf-8') | ||
427 | + id1 = names.index(s1) | ||
428 | + id2 = names.index(s2) | ||
429 | + id_matrix[id1, id2] = float(score) | ||
430 | + pbar.update(1) | ||
431 | + pbar.close() | ||
432 | + | ||
433 | + subprocess.run(["rm", "-f", f + "_3d_only.stk"]) | ||
434 | + np.save("data/"+f+".npy", id_matrix) | ||
435 | + idxQueue.put(thr_idx) # replace the thread index in the queue | ||
436 | + return 0 | ||
437 | + | ||
438 | +def seq_idty(): | ||
439 | + """Computes identity matrices for each of the RNA families. | ||
440 | + | ||
441 | + REQUIRES temporary results files in data/*.npy | ||
442 | + REQUIRES tables chain, family up to date.""" | |
443 | + | ||
444 | + # load distance matrices | ||
445 | + fam_arrays = [] | ||
446 | + for f in famlist: | ||
447 | + if path.isfile("data/"+f+".npy"): | ||
448 | + fam_arrays.append(np.load("data/"+f+".npy")) | ||
449 | + else: | ||
450 | + fam_arrays.append([]) | ||
451 | + | ||
452 | + # Update database with identity percentages | ||
453 | + conn = sqlite3.connect("results/RNANet.db") | ||
454 | + for f, D in zip(famlist, fam_arrays): | ||
455 | + if not len(D): continue | ||
456 | + a = 1.0 - np.average(D + D.T) # Get symmetric matrix instead of lower triangle + convert from distance matrix to identity matrix | ||
457 | + conn.execute(f"UPDATE family SET idty_percent = {round(float(a),2)} WHERE rfam_acc = '{f}';") | ||
458 | + conn.commit() | ||
459 | + conn.close() | ||
460 | + | ||
461 | + # Plots plots plots | ||
462 | + fig, axs = plt.subplots(4,17, figsize=(17,5.75)) | ||
463 | + axs = axs.ravel() | ||
464 | + [axi.set_axis_off() for axi in axs] | ||
465 | + im = "" # Just to declare the variable, it will be set in the loop | ||
466 | + for f, D, ax in zip(famlist, fam_arrays, axs): | ||
467 | + if not len(D): continue | ||
468 | + if D.shape[0] > 2: # Cluster only if there is more than 2 sequences to organize | ||
469 | + D = D + D.T # Copy the lower triangle to upper, to get a symmetrical matrix | |
470 | + condensedD = squareform(D) | ||
471 | + | ||
472 | + # Compute basic dendrogram by Ward's method | ||
473 | + Y = sch.linkage(condensedD, method='ward') | ||
474 | + Z = sch.dendrogram(Y, orientation='left', no_plot=True) | ||
475 | + | ||
476 | + # Reorganize rows and cols | ||
477 | + idx1 = Z['leaves'] | ||
478 | + D = D[idx1,:] | ||
479 | + D = D[:,idx1[::-1]] | ||
480 | + im = ax.matshow(1.0 - D, vmin=0, vmax=1, origin='lower') # convert to identity matrix 1 - D from distance matrix D | ||
481 | + ax.set_title(f + "\n(" + str(len(mappings_list[f]))+ " chains)", fontsize=10) | ||
482 | + fig.tight_layout() | ||
483 | + fig.subplots_adjust(wspace=0.1, hspace=0.3) | ||
484 | + fig.colorbar(im, ax=axs[-1], shrink=0.8) | ||
485 | + fig.savefig(f"results/figures/distances.png") | ||
486 | + notify("Computed all identity matrices and saved the figure.") | ||
355 | 487 | ||
356 | def stats_pairs(): | 488 | def stats_pairs(): |
357 | """Counts occurrences of intra-chain base-pair types in RNA families | 489 | """Counts occurrences of intra-chain base-pair types in RNA families |
... | @@ -363,26 +495,15 @@ def stats_pairs(): | ... | @@ -363,26 +495,15 @@ def stats_pairs(): |
363 | return family_data.apply(partial(format_percentage, sum(family_data))) | 495 | return family_data.apply(partial(format_percentage, sum(family_data))) |
364 | 496 | ||
365 | if not path.isfile("data/pair_counts.csv"): | 497 | if not path.isfile("data/pair_counts.csv"): |
366 | - p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=read_cpu_number(), maxtasksperchild=5) | 498 | + results = [] |
367 | - try: | 499 | + allpairs = [] |
368 | - fam_pbar = tqdm(total=len(fam_list), desc="Pair-types in families", position=0, leave=True) | 500 | + for f in fam_list: |
369 | - results = [] | 501 | + newpairs = pd.read_csv(f"data/{f}_pairs.csv", index_col=0) |
370 | - allpairs = [] | 502 | + fam_df = pd.read_csv(f"data/{f}_counts.csv", index_col=0) |
371 | - for _, newp_famdf in enumerate(p.imap_unordered(parallel_stats_pairs, fam_list)): | 503 | + results.append(fam_df) |
372 | - newpairs, fam_df = newp_famdf | 504 | + allpairs.append(newpairs) |
373 | - fam_pbar.update(1) | 505 | + subprocess.run(["rm", "-f", f"data/{f}_pairs.csv"]) |
374 | - results.append(fam_df) | 506 | + subprocess.run(["rm", "-f", f"data/{f}_counts.csv"]) |
375 | - allpairs.append(newpairs) | ||
376 | - fam_pbar.close() | ||
377 | - p.close() | ||
378 | - p.join() | ||
379 | - except KeyboardInterrupt: | ||
380 | - warn("KeyboardInterrupt, terminating workers.", error=True) | ||
381 | - fam_pbar.close() | ||
382 | - p.terminate() | ||
383 | - p.join() | ||
384 | - exit(1) | ||
385 | - | ||
386 | all_pairs = pd.concat(allpairs) | 507 | all_pairs = pd.concat(allpairs) |
387 | df = pd.concat(results).fillna(0) | 508 | df = pd.concat(results).fillna(0) |
388 | df.to_csv("data/pair_counts.csv") | 509 | df.to_csv("data/pair_counts.csv") |
... | @@ -431,92 +552,12 @@ def stats_pairs(): | ... | @@ -431,92 +552,12 @@ def stats_pairs(): |
431 | 552 | ||
432 | notify("Computed nucleotide statistics and saved CSV and PNG file.") | 553 | notify("Computed nucleotide statistics and saved CSV and PNG file.") |
433 | 554 | ||
434 | -def to_dist_matrix(f): | ||
435 | - if path.isfile("data/"+f+".npy"): | ||
436 | - notify(f"Computed {f} distance matrix", "loaded from file") | ||
437 | - return 0 | ||
438 | - | ||
439 | - notify(f"Computing {f} distance matrix from alignment...") | ||
440 | - dm = DistanceCalculator('identity') | ||
441 | - with open(path_to_seq_data+"/realigned/"+f+"++.afa") as al_file: | ||
442 | - al = AlignIO.read(al_file, "fasta")[-len(mappings_list[f]):] | ||
443 | - idty = dm.get_distance(al).matrix # list of lists | ||
444 | - del al | ||
445 | - l = len(idty) | ||
446 | - np.save("data/"+f+".npy", np.array([ idty[i] + [0]*(l-1-i) if i<l-1 else idty[i] for i in range(l) ], dtype=object)) | ||
447 | - del idty | ||
448 | - notify(f"Computed {f} distance matrix") | ||
449 | - return 0 | ||
450 | - | ||
451 | -def seq_idty(): | ||
452 | - """Computes identity matrices for each of the RNA families. | ||
453 | - | ||
454 | - Creates temporary results files in data/*.npy | ||
455 | - REQUIRES tables chain, family un to date.""" | ||
456 | - | ||
457 | - # List the families for which we will compute sequence identity matrices | ||
458 | - conn = sqlite3.connect("results/RNANet.db") | ||
459 | - famlist = [ x[0] for x in sql_ask_database(conn, "SELECT rfam_acc from (SELECT rfam_acc, COUNT(chain_id) as n_chains FROM family NATURAL JOIN chain GROUP BY rfam_acc) WHERE n_chains > 1 ORDER BY rfam_acc ASC;") ] | ||
460 | - ignored = [ x[0] for x in sql_ask_database(conn, "SELECT rfam_acc from (SELECT rfam_acc, COUNT(chain_id) as n_chains FROM family NATURAL JOIN chain GROUP BY rfam_acc) WHERE n_chains < 2 ORDER BY rfam_acc ASC;") ] | ||
461 | - if len(ignored): | ||
462 | - print(f"Idty matrices: Ignoring {len(ignored)} families with only one chain:", " ".join(ignored)+'\n') | ||
463 | - | ||
464 | - # compute distance matrices (or ignore if data/RF0****.npy exists) | ||
465 | - p = Pool(processes=8) | ||
466 | - p.map(to_dist_matrix, famlist) | ||
467 | - p.close() | ||
468 | - p.join() | ||
469 | - | ||
470 | - # load them | ||
471 | - fam_arrays = [] | ||
472 | - for f in famlist: | ||
473 | - if path.isfile("data/"+f+".npy"): | ||
474 | - fam_arrays.append(np.load("data/"+f+".npy")) | ||
475 | - else: | ||
476 | - fam_arrays.append([]) | ||
477 | - | ||
478 | - # Update database with identity percentages | ||
479 | - conn = sqlite3.connect("results/RNANet.db") | ||
480 | - for f, D in zip(famlist, fam_arrays): | ||
481 | - if not len(D): continue | ||
482 | - a = 1.0 - np.average(D + D.T) # Get symmetric matrix instead of lower triangle + convert from distance matrix to identity matrix | ||
483 | - conn.execute(f"UPDATE family SET idty_percent = {round(float(a),2)} WHERE rfam_acc = '{f}';") | ||
484 | - conn.commit() | ||
485 | - conn.close() | ||
486 | - | ||
487 | - # Plots plots plots | ||
488 | - fig, axs = plt.subplots(4,17, figsize=(17,5.75)) | ||
489 | - axs = axs.ravel() | ||
490 | - [axi.set_axis_off() for axi in axs] | ||
491 | - im = "" # Just to declare the variable, it will be set in the loop | ||
492 | - for f, D, ax in zip(famlist, fam_arrays, axs): | ||
493 | - if not len(D): continue | ||
494 | - if D.shape[0] > 2: # Cluster only if there is more than 2 sequences to organize | ||
495 | - D = D + D.T # Copy the lower triangle to upper, to get a symetrical matrix | ||
496 | - condensedD = squareform(D) | ||
497 | - | ||
498 | - # Compute basic dendrogram by Ward's method | ||
499 | - Y = sch.linkage(condensedD, method='ward') | ||
500 | - Z = sch.dendrogram(Y, orientation='left', no_plot=True) | ||
501 | - | ||
502 | - # Reorganize rows and cols | ||
503 | - idx1 = Z['leaves'] | ||
504 | - D = D[idx1,:] | ||
505 | - D = D[:,idx1[::-1]] | ||
506 | - im = ax.matshow(1.0 - D, vmin=0, vmax=1, origin='lower') # convert to identity matrix 1 - D from distance matrix D | ||
507 | - ax.set_title(f + "\n(" + str(len(mappings_list[f]))+ " chains)", fontsize=10) | ||
508 | - fig.tight_layout() | ||
509 | - fig.subplots_adjust(wspace=0.1, hspace=0.3) | ||
510 | - fig.colorbar(im, ax=axs[-1], shrink=0.8) | ||
511 | - fig.savefig(f"results/figures/distances.png") | ||
512 | - notify("Computed all identity matrices and saved the figure.") | ||
513 | - | ||
514 | def per_chain_stats(): | 555 | def per_chain_stats(): |
515 | """Computes per-chain frequencies and base-pair type counts. | 556 | """Computes per-chain frequencies and base-pair type counts. |
516 | 557 | ||
517 | REQUIRES tables chain, nucleotide up to date. """ | 558 | REQUIRES tables chain, nucleotide up to date. """ |
518 | 559 | ||
519 | - with sqlite3.connect("results/RNANet.db") as conn: | 560 | + with sqlite3.connect("results/RNANet.db", isolation_level=None) as conn: |
520 | # Compute per-chain nucleotide frequencies | 561 | # Compute per-chain nucleotide frequencies |
521 | df = pd.read_sql("SELECT SUM(is_A) as A, SUM(is_C) AS C, SUM(is_G) AS G, SUM(is_U) AS U, SUM(is_other) AS O, chain_id FROM nucleotide GROUP BY chain_id;", conn) | 562 | df = pd.read_sql("SELECT SUM(is_A) as A, SUM(is_C) AS C, SUM(is_G) AS G, SUM(is_U) AS U, SUM(is_other) AS O, chain_id FROM nucleotide GROUP BY chain_id;", conn) |
522 | df["total"] = pd.Series(df.A + df.C + df.G + df.U + df.O, dtype=np.float64) | 563 | df["total"] = pd.Series(df.A + df.C + df.G + df.U + df.O, dtype=np.float64) |
... | @@ -524,39 +565,74 @@ def per_chain_stats(): | ... | @@ -524,39 +565,74 @@ def per_chain_stats(): |
524 | df = df.drop("total", axis=1) | 565 | df = df.drop("total", axis=1) |
525 | 566 | ||
526 | # Set the values | 567 | # Set the values |
568 | + conn.execute('pragma journal_mode=wal') | ||
527 | sql_execute(conn, "UPDATE chain SET chain_freq_A = ?, chain_freq_C = ?, chain_freq_G = ?, chain_freq_U = ?, chain_freq_other = ? WHERE chain_id= ?;", | 569 | sql_execute(conn, "UPDATE chain SET chain_freq_A = ?, chain_freq_C = ?, chain_freq_G = ?, chain_freq_U = ?, chain_freq_other = ? WHERE chain_id= ?;", |
528 | many=True, data=list(df.to_records(index=False)), warn_every=10) | 570 | many=True, data=list(df.to_records(index=False)), warn_every=10) |
529 | notify("Updated the database with per-chain base frequencies") | 571 | notify("Updated the database with per-chain base frequencies") |
530 | 572 | ||
573 | +def log_to_pbar(pbar): | ||
574 | + def update(r): | ||
575 | + pbar.update(1) | ||
576 | + return update | ||
577 | + | ||
531 | if __name__ == "__main__": | 578 | if __name__ == "__main__": |
532 | 579 | ||
533 | os.makedirs("results/figures/wadley_plots/", exist_ok=True) | 580 | os.makedirs("results/figures/wadley_plots/", exist_ok=True) |
534 | 581 | ||
535 | print("Loading mappings list...") | 582 | print("Loading mappings list...") |
536 | - conn = sqlite3.connect("results/RNANet.db") | 583 | + with sqlite3.connect("results/RNANet.db") as conn: |
537 | - fam_list = [ x[0] for x in sql_ask_database(conn, "SELECT rfam_acc from family ORDER BY rfam_acc ASC;") ] | 584 | + fam_list = [ x[0] for x in sql_ask_database(conn, "SELECT rfam_acc from family ORDER BY rfam_acc ASC;") ] |
538 | - mappings_list = {} | 585 | + mappings_list = {} |
539 | - for k in fam_list: | 586 | + for k in fam_list: |
540 | - mappings_list[k] = [ x[0] for x in sql_ask_database(conn, f"SELECT chain_id from chain WHERE rfam_acc='{k}';") ] | 587 | + mappings_list[k] = [ x[0] for x in sql_ask_database(conn, f"SELECT chain_id from chain WHERE rfam_acc='{k}' and issue=0;") ] |
541 | - conn.close() | ||
542 | - | ||
543 | - # stats_pairs() | ||
544 | - | ||
545 | - # Define threads for the tasks | ||
546 | - threads = [ | ||
547 | - th.Thread(target=reproduce_wadley_results, kwargs={'carbon': 1}), | ||
548 | - th.Thread(target=reproduce_wadley_results, kwargs={'carbon': 4}), | ||
549 | - th.Thread(target=stats_len), # computes figures | ||
550 | - th.Thread(target=stats_freq), # Updates the database | ||
551 | - th.Thread(target=seq_idty), # produces .npy files and seq idty figures | ||
552 | - th.Thread(target=per_chain_stats) # Updates the database | ||
553 | - ] | ||
554 | - | ||
555 | - # Start the threads | ||
556 | - for t in threads: | ||
557 | - t.start() | ||
558 | - | ||
559 | - # Wait for the threads to complete | ||
560 | - for t in threads: | ||
561 | - t.join() | ||
562 | 588 | ||
589 | + # List the families for which we will compute sequence identity matrices | ||
590 | + with sqlite3.connect("results/RNANet.db") as conn: | ||
591 | + famlist = [ x[0] for x in sql_ask_database(conn, "SELECT rfam_acc from (SELECT rfam_acc, COUNT(chain_id) as n_chains FROM family NATURAL JOIN chain GROUP BY rfam_acc) WHERE n_chains > 0 ORDER BY rfam_acc ASC;") ] | ||
592 | + ignored = [ x[0] for x in sql_ask_database(conn, "SELECT rfam_acc from (SELECT rfam_acc, COUNT(chain_id) as n_chains FROM family NATURAL JOIN chain GROUP BY rfam_acc) WHERE n_chains < 2 ORDER BY rfam_acc ASC;") ] | ||
593 | + if len(ignored): | ||
594 | + print(f"Idty matrices: Ignoring {len(ignored)} families with only one chain:", " ".join(ignored)+'\n') | ||
595 | + | ||
596 | + # Prepare the multiprocessing execution environment | ||
597 | + nworkers = max(read_cpu_number()-1, 32) | ||
598 | + thr_idx_mgr = Manager() | ||
599 | + idxQueue = thr_idx_mgr.Queue() | ||
600 | + for i in range(nworkers): | ||
601 | + idxQueue.put(i) | ||
602 | + | ||
603 | + # Define the tasks | ||
604 | + joblist = [] | ||
605 | + joblist.append(Job(function=reproduce_wadley_results, args=(1,))) | ||
606 | + joblist.append(Job(function=reproduce_wadley_results, args=(4,))) | ||
607 | + joblist.append(Job(function=stats_len)) # Computes figures | ||
608 | + joblist.append(Job(function=stats_freq)) # updates the database | ||
609 | + for f in famlist: | ||
610 | + joblist.append(Job(function=parallel_stats_pairs, args=(f,))) # updates the database | ||
611 | + if f not in ignored: | ||
612 | + joblist.append(Job(function=to_dist_matrix, args=(f,))) # writes the data/<family>.npy matrix file | |
613 | + | ||
614 | + p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=nworkers) | ||
615 | + pbar = tqdm(total=len(joblist), desc="Stat jobs", position=0, leave=True) | ||
616 | +sqlite3 | ||
617 | + try: | ||
618 | + for j in joblist: | ||
619 | + p.apply_async(j.func_, args=j.args_, callback=log_to_pbar(pbar)) | ||
620 | + p.close() | ||
621 | + p.join() | ||
622 | + pbar.close() | ||
623 | + except KeyboardInterrupt: | ||
624 | + warn("KeyboardInterrupt, terminating workers.", error=True) | ||
625 | + p.terminate() | ||
626 | + p.join() | ||
627 | + pbar.close() | ||
628 | + exit(1) | ||
629 | + except: | ||
630 | + print("Something went wrong") | ||
631 | + | ||
632 | + print() | ||
633 | + print() | ||
634 | + | ||
635 | + # finish the work after the parallel portions | ||
636 | + per_chain_stats() | ||
637 | + seq_idty() | ||
638 | + stats_pairs() | ... | ... |
-
Please register or login to post a comment