Louis BECQUEY

Merge branch 'master' of https://github.com/persalteas/RNANet into master

......@@ -7,7 +7,7 @@ Future versions might compute a real MSA-based clusering directly with Rfamseq n
This script prepares the dataset from available public data in PDB and Rfam.
**Please cite**: *Coming soon, expect it summer 2020*
**Please cite**: *Coming soon, expect it in 2021*
# What it does
The script follows these steps:
......@@ -72,7 +72,7 @@ You need to install:
## Command line
Run `./RNANet.py --3d-folder path/to/3D/data/folder --seq-folder path/to/sequence/data/folder [ - other options ]`.
It requires solid hardware to run. It takes around 15 hours the first time, and 9h then, tested on a server with 32 cores and 48GB of RAM.
It requires solid hardware to run. It takes around 12 to 15 hours the first time, and 1 to 3h then, tested on a server with 32 cores and 48GB of RAM.
The detailed list of options is below:
```
......
......@@ -273,32 +273,39 @@ class Chain:
if self.mapping is not None:
self.mapping.log(f"Shifting nt_resnum numbering because of {n_dup} duplicate residues {df.iloc[i,1]}")
if df.iloc[i,1] == df.iloc[i-1,1] and df.iloc[index_last_dup + 1, 1] - 1 > df.iloc[index_last_dup, 1]:
# The redundant nts are consecutive in the chain (at the beginning at least), and there is a gap at the end
if duplicates.iloc[n_dup-1, 0] - duplicates.iloc[0, 0] + 1 == n_dup:
# They are all contiguous in the chain
# 4v9n-DA case (and similar ones) : 610-611-611A-611B-611C-611D-611E-611F-611G-617-618...
# there is a redundancy (611) followed by a gap (611-617).
# We want the redundancy to fill the gap.
df.iloc[i:i+n_dup-1, 1] += 1
try:
if i > 0 and index_last_dup +1 < len(df.index) and df.iloc[i,1] == df.iloc[i-1,1] and df.iloc[index_last_dup + 1, 1] - 1 > df.iloc[index_last_dup, 1]:
# The redundant nts are consecutive in the chain (at the beginning at least), and there is a gap at the end
if duplicates.iloc[n_dup-1, 0] - duplicates.iloc[0, 0] + 1 == n_dup:
# They are all contiguous in the chain
# 4v9n-DA case (and similar ones) : 610-611-611A-611B-611C-611D-611E-611F-611G-617-618...
# there is a redundancy (611) followed by a gap (611-617).
# We want the redundancy to fill the gap.
df.iloc[i:i+n_dup-1, 1] += 1
else:
# We solve the problem continuous component by continuous component
for j in range(1, n_dup+1):
if duplicates.iloc[j,0] == 1 + duplicates.iloc[j-1,0]: # continuous
df.iloc[i+j-1,1] += 1
else:
break
elif df.iloc[i,1] == df.iloc[i-1,1]:
# Common 4v9q-DV case (and similar ones) : e.g. chains contains 17 and 17A which are both read 17 by DSSR.
# Solution : we shift the numbering of 17A (to 18) and the following residues.
df.iloc[i:, 1] += 1
else:
# We solve the problem continuous component by continuous component
for j in range(1, n_dup+1):
if duplicates.iloc[j,0] == 1 + duplicates.iloc[j-1,0]: # continuous
df.iloc[i+j-1,1] += 1
else:
break
elif df.iloc[i,1] == df.iloc[i-1,1]:
# Common 4v9q-DV case (and similar ones) : e.g. chains contains 17 and 17A which are both read 17 by DSSR.
# Solution : we shift the numbering of 17A (to 18) and the following residues.
df.iloc[i:, 1] += 1
else:
# 4v9k-DA case (and similar ones) : the nt_id is not the full nt_resnum: ... 1629 > 1630 > 163B > 1631 > ...
# Here the 163B is read 163 by DSSR, but there already is a residue 163.
# Solution : set nt_resnum[i] to nt_resnum[i-1] + 1, and shift the following by 1.
df.iloc[i, 1] = 1 + df.iloc[i-1, 1]
df.iloc[i+1:, 1] += 1
# 4v9k-DA case (and similar ones) : the nt_id is not the full nt_resnum: ... 1629 > 1630 > 163B > 1631 > ...
# Here the 163B is read 163 by DSSR, but there already is a residue 163.
# Solution : set nt_resnum[i] to nt_resnum[i-1] + 1, and shift the following by 1.
df.iloc[i, 1] = 1 + df.iloc[i-1, 1]
df.iloc[i+1:, 1] += 1
except:
warn(f"Error with parsing of {self.chain_label} duplicate residue numbers. Ignoring it.")
self.delete_me = True
self.error_messages = f"Error with parsing of duplicate residues numbers."
return None
# Search for ligands at the end of the selection
# Drop ligands detected as residues by DSSR, by detecting several markers
......@@ -1019,7 +1026,7 @@ class Pipeline:
print(f"nohup bash -c 'time {runDir}/RNAnet.py --3d-folder ~/Data/RNA/3D/ --seq-folder ~/Data/RNA/sequences -s --archive' &")
sys.exit()
elif opt == '--version':
print("RNANet 1.0 alpha ")
print("RNANet 1.1 beta")
sys.exit()
elif opt == "-r" or opt == "--resolution":
assert float(arg) > 0.0 and float(arg) <= 20.0
......@@ -1382,7 +1389,7 @@ class Pipeline:
# Remove previous precomputed data
subprocess.run(["rm","-f", "data/wadley_kernel_eta.npz", "data/wadley_kernel_eta_prime.npz", "data/pair_counts.csv"])
for f in self.fam_list:
subprocess.run(["rm","-f", f"data/{f}.npy"])
subprocess.run(["rm","-f", f"data/{f}.npy", f"data/{f}_pairs.csv", f"data/{f}_counts.csv"])
# Run statistics files
os.chdir(runDir)
......@@ -1390,13 +1397,12 @@ class Pipeline:
subprocess.run(["python3.8", "statistics.py", path_to_3D_data, path_to_seq_data])
# Save additional informations
conn = sqlite3.connect(runDir+"/results/RNANet.db")
pd.read_sql_query("SELECT rfam_acc, description, idty_percent, nb_homologs, nb_3d_chains, nb_total_homol, max_len, comput_time, comput_peak_mem from family ORDER BY nb_3d_chains DESC;",
conn).to_csv(runDir + f"/results/archive/families_{time_str}.csv", float_format="%.2f", index=False)
pd.read_sql_query("""SELECT structure_id, chain_name, pdb_start, pdb_end, rfam_acc, inferred, date, exp_method, resolution, issue FROM structure
JOIN chain ON structure.pdb_id = chain.structure_id
ORDER BY structure_id, chain_name, rfam_acc ASC;""", conn).to_csv(runDir + f"/results/archive/summary_{time_str}.csv", float_format="%.2f", index=False)
conn.close()
with sqlite3.connect(runDir+"/results/RNANet.db") as conn:
pd.read_sql_query("SELECT rfam_acc, description, idty_percent, nb_homologs, nb_3d_chains, nb_total_homol, max_len, comput_time, comput_peak_mem from family ORDER BY nb_3d_chains DESC;",
conn).to_csv(runDir + f"/results/archive/families_{time_str}.csv", float_format="%.2f", index=False)
pd.read_sql_query("""SELECT structure_id, chain_name, pdb_start, pdb_end, rfam_acc, inferred, date, exp_method, resolution, issue FROM structure
JOIN chain ON structure.pdb_id = chain.structure_id
ORDER BY structure_id, chain_name, rfam_acc ASC;""", conn).to_csv(runDir + f"/results/archive/summary_{time_str}.csv", float_format="%.2f", index=False)
# Archive the results
if self.SELECT_ONLY is None:
......@@ -1404,7 +1410,10 @@ class Pipeline:
subprocess.run(["tar","-C", path_to_3D_data + "/datapoints","-czf",f"results/archive/RNANET_datapoints_{time_str}.tar.gz","."])
# Update shortcuts to latest versions
subprocess.run(["rm", "-f", runDir + "/results/RNANET_datapoints_latest.tar.gz", runDir + "/results/summary_latest.csv", runDir + "/results/families_latest.csv"])
subprocess.run(["rm", "-f", runDir + "/results/RNANET_datapoints_latest.tar.gz",
runDir + "/results/summary_latest.csv",
runDir + "/results/families_latest.csv"
])
subprocess.run(['ln',"-s", runDir +f"/results/archive/RNANET_datapoints_{time_str}.tar.gz", runDir + "/results/RNANET_datapoints_latest.tar.gz"])
subprocess.run(['ln',"-s", runDir +f"/results/archive/summary_{time_str}.csv", runDir + "/results/summary_latest.csv"])
subprocess.run(['ln',"-s", runDir +f"/results/archive/families_{time_str}.csv", runDir + "/results/families_latest.csv"])
......@@ -1631,6 +1640,7 @@ def sql_ask_database(conn, sql, warn_every = 10):
@trace_unhandled_exceptions
def sql_execute(conn, sql, many=False, data=None, warn_every=10):
conn.execute('pragma journal_mode=wal') # Allow multiple other readers to ask things while we execute this writing query
for _ in range(100): # retry 100 times if it fails
try:
if many:
......@@ -2397,6 +2407,7 @@ if __name__ == "__main__":
rfam_acc_to_download[c.mapping.rfam_acc] = [ c ]
else:
rfam_acc_to_download[c.mapping.rfam_acc].append(c)
print(f"> Identified {len(rfam_acc_to_download.keys())} families to update and re-align with the crystals' sequences")
pp.fam_list = sorted(rfam_acc_to_download.keys())
......
# This is a script supposed to be run periodically as a cron job
cd /home/lbecquey/Projects/RNANet
rm -f latest_run.log errors.txt
# Run RNANet
cd /home/lbecquey/Projects/RNANet;
rm -f stdout.txt stderr.txt errors.txt;
time './RNAnet.py --3d-folder /home/lbequey/Data/RNA/3D/ --seq-folder /home/lbecquey/Data/RNA/sequences/ -s -r 20.0' > stdout.txt 2> stderr.txt;
bash -c 'time ./RNAnet.py --3d-folder /home/lbecquey/Data/RNA/3D/ --seq-folder /home/lbecquey/Data/RNA/sequences/ -r 20.0 -s --archive' &> latest_run.log
touch results/RNANet.db # update last modification date
rm -f results/RNANet.db-wal results/RNANet.db-shm # SQLite temporary files
# Sync in Seafile
seaf-cli start;
# Compress
rm -f /home/lbecquey/Projects/RNANet/results/RNANet.db.gz
echo 'Deleted results/RNANet.db.gz (if existed)' >> latest_run.log
gzip -k /home/lbecquey/Projects/RNANet/results/RNANet.db
echo 'Recreated it.' >> latest_run.log
seaf-cli stop;
# Sync in Seafile
seaf-cli start >> latest_run.log 2>&1
echo 'Waiting 10m for SeaFile synchronization...' >> latest_run.log
sleep 10m
echo `seaf-cli status` >> latest_run.log
seaf-cli stop >> latest_run.log 2>&1
echo 'We are '`date`', update completed.' >> latest_run.log
......
......@@ -10,6 +10,17 @@ for KILLPID in $PROCESS_LIST; do
fi
done
# Terminate any running statistics.py process (companion step to the RNAnet.py kill above).
PROCESS_TO_KILL="statistics.py"
# Collect the PIDs of matching processes; exclude the grep itself and any
# open editor session ("vi statistics.py") so we do not kill those.
PROCESS_LIST=`ps ax | grep -Ei ${PROCESS_TO_KILL} | grep -Eiv '(grep|vi statistics.py)' | awk ' { print $1;}'`
# Tracks whether at least one process was killed (empty = none).
KILLED=
for KILLPID in $PROCESS_LIST; do
if [ ! -z $KILLPID ];then
# SIGKILL: force immediate termination, no chance for cleanup.
kill -9 $KILLPID
echo "Killed PID ${KILLPID}"
KILLED=yes
fi
done
# Report explicitly when no matching process was found.
if [ -z $KILLED ];then
echo "Didn't kill anything"
fi
......
1ml5_1_a_1-2914
1ml5_1_a_151-2903
1ml5_1_A_7-1518
1ml5_1_A_7-1515
1ml5_1_A_2-1520
1ml5_1_b_5-121
2rdo_1_A_3-118
4v48_1_A9_3-118
4v47_1_A9_3-118
1vy7_1_AY_1-73
1vy7_1_CY_1-73
4w2h_1_CY_1-73
6zmi_1_L8_1267-4755
6zm7_1_L8_1267-4755
6y6x_1_L8_1267-4755
6z6n_1_L8_1267-4755
6qzp_1_L8_1267-4755
6zme_1_L8_1267-4755
6z6l_1_L8_1267-4755
6ek0_1_L8_1267-4755
6zmo_1_L8_1267-4755
6z6m_1_L8_1267-4755
6ole_1_D_1267-4755
6om0_1_D_1267-4755
6y2l_1_L8_1267-4755
6y0g_1_L8_1267-4755
6oli_1_D_1267-4755
6olg_1_A3_1267-4755
6y57_1_L8_1267-4755
5t2c_1_C_1267-4755
6om7_1_D_1267-4755
4ug0_1_L8_1267-4755
6olf_1_D_1267-4755
6ip5_1_1C_1267-4755
6ip8_1_1C_1267-4755
6olz_1_A3_1267-4755
5aj0_1_A3_1267-4755
5lks_1_L8_1267-4755
6ip6_1_1C_1267-4755
4v6x_1_A8_1267-4755
2z9q_1_A_1-72
1ls2_1_B_1-73
3ep2_1_Y_1-72
3eq3_1_Y_1-72
4v48_1_A6_1-73
1gsg_1_T_1-72
3jcr_1_H_1-115
1eg0_1_O_1-73
4v42_1_BB_5-121
4v42_1_BA_1-2914
4v42_1_BA_151-2903
2ob7_1_A_10-319
1x1l_1_A_1-130
1zc8_1_Z_1-130
1zc8_1_Z_1-91
2ob7_1_D_1-130
1r2x_1_C_1-58
1r2w_1_C_1-58
1eg0_1_L_1-56
1eg0_1_L_1-57
6rxu_1_C2_588-2386
6rxu_1_C2_588-2383
6rxu_1_C2_583-2388
5oql_1_2_588-2386
5oql_1_2_588-2383
5oql_1_2_583-2388
6rxv_1_C2_588-2386
6rxv_1_C2_588-2383
6rxv_1_C2_583-2388
6rxz_1_C2_588-2386
6rxz_1_C2_588-2383
6rxz_1_C2_583-2388
6rxy_1_C2_588-2386
6rxy_1_C2_588-2383
6rxy_1_C2_583-2388
6rxt_1_C2_588-2386
6rxt_1_C2_588-2383
6rxt_1_C2_583-2388
4v48_1_BA_1-91
4v48_1_BA_6-1541
4v48_1_BA_6-1538
4v48_1_BA_1-1543
4v47_1_BA_1-91
4v47_1_BA_6-1540
4v47_1_BA_6-1537
4v47_1_BA_1-1542
2rdo_1_B_6-1460
2rdo_1_B_6-1522
2rdo_1_B_1-2903
2rdo_1_B_6-1457
2rdo_1_B_1-2904
2rdo_1_B_1-1528
2rdo_1_B_160-2893
4v48_1_A0_6-1460
4v48_1_A0_6-1522
4v48_1_A0_1-2903
4v48_1_A0_6-1457
4v48_1_A0_1-2904
4v48_1_A0_1-1528
4v48_1_A0_160-2893
4v47_1_A0_6-1460
4v47_1_A0_6-1522
4v47_1_A0_1-2903
4v47_1_A0_6-1457
4v47_1_A0_1-2904
4v47_1_A0_1-1528
4v47_1_A0_160-2893
1zc8_1_A_1-59
1mvr_1_D_1-59
4c9d_1_D_29-1
4c9d_1_C_29-1
4adx_1_9_1-121
1zn1_1_B_1-59
1emi_1_B_1-108
3iy9_1_A_498-1027
1jgq_1_A_20-55
1jgq_1_A_7-1518
1jgq_1_A_7-1515
1jgq_1_A_2-1520
4v42_1_AA_20-55
4v42_1_AA_7-1518
4v42_1_AA_7-1515
4v42_1_AA_2-1520
1jgo_1_A_20-55
1jgo_1_A_7-1518
1jgo_1_A_7-1515
1jgo_1_A_2-1520
1jgp_1_A_20-55
1jgp_1_A_7-1518
1jgp_1_A_7-1515
1jgp_1_A_2-1520
3ep2_1_B_1-50
3eq3_1_B_1-50
3eq4_1_B_1-50
3pgw_1_R_1-164
3pgw_1_N_1-164
3cw1_1_x_1-138
3cw1_1_w_1-138
3cw1_1_V_1-138
3cw1_1_v_1-138
2iy3_1_B_9-105
3jcr_1_N_1-106
3jcr_1_N_1-188
2vaz_1_A_64-177
2ftc_1_R_81-1466
2ftc_1_R_1-1568
2ftc_1_R_792-1568
3jcr_1_M_1-141
3jcr_1_M_1-107
3jcr_1_M_1-188
4v5z_1_B0_1-2840
4v5z_1_B0_1-2899
4v5z_1_B0_1-2902
5g2x_1_A_595-692
3iy8_1_A_1-540
4v5z_1_BY_2-113
4v5z_1_BZ_1-70
1mvr_1_B_1-96
4adx_1_0_1-2923
4adx_1_0_132-2915
3eq4_1_Y_1-69
4v5z_1_AA_1-1562
4v5z_1_AA_1-1563
6lqm_1_8_1267-4755
6lu8_1_8_1267-4755
6lsr_1_8_1267-4755
6lss_1_8_1267-4755
1ml5_1_a_1-2914
Could not find nucleotides of chain a in annotation 1ml5.json. Either there is a problem with 1ml5 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
1ml5_1_a_151-2903
Could not find nucleotides of chain a in annotation 1ml5.json. Either there is a problem with 1ml5 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
1ml5_1_A_7-1518
Could not find nucleotides of chain A in annotation 1ml5.json. Either there is a problem with 1ml5 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
1ml5_1_A_7-1515
Could not find nucleotides of chain A in annotation 1ml5.json. Either there is a problem with 1ml5 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
1ml5_1_A_2-1520
Could not find nucleotides of chain A in annotation 1ml5.json. Either there is a problem with 1ml5 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
1ml5_1_b_5-121
Could not find nucleotides of chain b in annotation 1ml5.json. Either there is a problem with 1ml5 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
2rdo_1_A_3-118
DSSR warning 2rdo.json: no nucleotides found. Ignoring 2rdo_1_A_3-118.
4v48_1_A9_3-118
DSSR warning 4v48.json: no nucleotides found. Ignoring 4v48_1_A9_3-118.
4v47_1_A9_3-118
DSSR warning 4v47.json: no nucleotides found. Ignoring 4v47_1_A9_3-118.
1vy7_1_AY_1-73
Sequence is too short. (< 5 resolved nts)
1vy7_1_CY_1-73
Sequence is too short. (< 5 resolved nts)
4w2h_1_CY_1-73
Sequence is too short. (< 5 resolved nts)
6zmi_1_L8_1267-4755
Could not find nucleotides of chain L8 in annotation 6zmi.json. Either there is a problem with 6zmi mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
6zm7_1_L8_1267-4755
Could not find nucleotides of chain L8 in annotation 6zm7.json. Either there is a problem with 6zm7 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
6y6x_1_L8_1267-4755
Could not find nucleotides of chain L8 in annotation 6y6x.json. Either there is a problem with 6y6x mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
6z6n_1_L8_1267-4755
Could not find nucleotides of chain L8 in annotation 6z6n.json. Either there is a problem with 6z6n mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
6qzp_1_L8_1267-4755
Could not find nucleotides of chain L8 in annotation 6qzp.json. Either there is a problem with 6qzp mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
6zme_1_L8_1267-4755
Could not find nucleotides of chain L8 in annotation 6zme.json. Either there is a problem with 6zme mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
6z6l_1_L8_1267-4755
Could not find nucleotides of chain L8 in annotation 6z6l.json. Either there is a problem with 6z6l mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
6ek0_1_L8_1267-4755
Could not find nucleotides of chain L8 in annotation 6ek0.json. Either there is a problem with 6ek0 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
6zmo_1_L8_1267-4755
Could not find nucleotides of chain L8 in annotation 6zmo.json. Either there is a problem with 6zmo mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
6z6m_1_L8_1267-4755
Could not find nucleotides of chain L8 in annotation 6z6m.json. Either there is a problem with 6z6m mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
6ole_1_D_1267-4755
Could not find nucleotides of chain D in annotation 6ole.json. Either there is a problem with 6ole mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
6om0_1_D_1267-4755
Could not find nucleotides of chain D in annotation 6om0.json. Either there is a problem with 6om0 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
6y2l_1_L8_1267-4755
Could not find nucleotides of chain L8 in annotation 6y2l.json. Either there is a problem with 6y2l mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
6y0g_1_L8_1267-4755
Could not find nucleotides of chain L8 in annotation 6y0g.json. Either there is a problem with 6y0g mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
6oli_1_D_1267-4755
Could not find nucleotides of chain D in annotation 6oli.json. Either there is a problem with 6oli mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
6olg_1_A3_1267-4755
Could not find nucleotides of chain A3 in annotation 6olg.json. Either there is a problem with 6olg mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
6y57_1_L8_1267-4755
Could not find nucleotides of chain L8 in annotation 6y57.json. Either there is a problem with 6y57 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
5t2c_1_C_1267-4755
Could not find nucleotides of chain C in annotation 5t2c.json. Either there is a problem with 5t2c mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
6om7_1_D_1267-4755
Could not find nucleotides of chain D in annotation 6om7.json. Either there is a problem with 6om7 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
4ug0_1_L8_1267-4755
Could not find nucleotides of chain L8 in annotation 4ug0.json. Either there is a problem with 4ug0 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
6olf_1_D_1267-4755
Could not find nucleotides of chain D in annotation 6olf.json. Either there is a problem with 6olf mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
6ip5_1_1C_1267-4755
Could not find nucleotides of chain 1C in annotation 6ip5.json. Either there is a problem with 6ip5 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
6ip8_1_1C_1267-4755
Could not find nucleotides of chain 1C in annotation 6ip8.json. Either there is a problem with 6ip8 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
6olz_1_A3_1267-4755
Could not find nucleotides of chain A3 in annotation 6olz.json. Either there is a problem with 6olz mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
5aj0_1_A3_1267-4755
Could not find nucleotides of chain A3 in annotation 5aj0.json. Either there is a problem with 5aj0 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
5lks_1_L8_1267-4755
Could not find nucleotides of chain L8 in annotation 5lks.json. Either there is a problem with 5lks mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
6ip6_1_1C_1267-4755
Could not find nucleotides of chain 1C in annotation 6ip6.json. Either there is a problem with 6ip6 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
4v6x_1_A8_1267-4755
Could not find nucleotides of chain A8 in annotation 4v6x.json. Either there is a problem with 4v6x mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
2z9q_1_A_1-72
DSSR warning 2z9q.json: no nucleotides found. Ignoring 2z9q_1_A_1-72.
1ls2_1_B_1-73
DSSR warning 1ls2.json: no nucleotides found. Ignoring 1ls2_1_B_1-73.
3ep2_1_Y_1-72
DSSR warning 3ep2.json: no nucleotides found. Ignoring 3ep2_1_Y_1-72.
3eq3_1_Y_1-72
DSSR warning 3eq3.json: no nucleotides found. Ignoring 3eq3_1_Y_1-72.
4v48_1_A6_1-73
DSSR warning 4v48.json: no nucleotides found. Ignoring 4v48_1_A6_1-73.
1gsg_1_T_1-72
DSSR warning 1gsg.json: no nucleotides found. Ignoring 1gsg_1_T_1-72.
3jcr_1_H_1-115
DSSR warning 3jcr.json: no nucleotides found. Ignoring 3jcr_1_H_1-115.
1eg0_1_O_1-73
DSSR warning 1eg0.json: no nucleotides found. Ignoring 1eg0_1_O_1-73.
4v42_1_BB_5-121
Could not find nucleotides of chain BB in annotation 4v42.json. Either there is a problem with 4v42 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
4v42_1_BA_1-2914
Could not find nucleotides of chain BA in annotation 4v42.json. Either there is a problem with 4v42 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
4v42_1_BA_151-2903
Could not find nucleotides of chain BA in annotation 4v42.json. Either there is a problem with 4v42 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
2ob7_1_A_10-319
DSSR warning 2ob7.json: no nucleotides found. Ignoring 2ob7_1_A_10-319.
1x1l_1_A_1-130
DSSR warning 1x1l.json: no nucleotides found. Ignoring 1x1l_1_A_1-130.
1zc8_1_Z_1-130
DSSR warning 1zc8.json: no nucleotides found. Ignoring 1zc8_1_Z_1-130.
1zc8_1_Z_1-91
DSSR warning 1zc8.json: no nucleotides found. Ignoring 1zc8_1_Z_1-91.
2ob7_1_D_1-130
DSSR warning 2ob7.json: no nucleotides found. Ignoring 2ob7_1_D_1-130.
1r2x_1_C_1-58
DSSR warning 1r2x.json: no nucleotides found. Ignoring 1r2x_1_C_1-58.
1r2w_1_C_1-58
DSSR warning 1r2w.json: no nucleotides found. Ignoring 1r2w_1_C_1-58.
1eg0_1_L_1-56
DSSR warning 1eg0.json: no nucleotides found. Ignoring 1eg0_1_L_1-56.
1eg0_1_L_1-57
DSSR warning 1eg0.json: no nucleotides found. Ignoring 1eg0_1_L_1-57.
6rxu_1_C2_588-2386
Could not find nucleotides of chain C2 in annotation 6rxu.json. Either there is a problem with 6rxu mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
6rxu_1_C2_588-2383
Could not find nucleotides of chain C2 in annotation 6rxu.json. Either there is a problem with 6rxu mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
6rxu_1_C2_583-2388
Could not find nucleotides of chain C2 in annotation 6rxu.json. Either there is a problem with 6rxu mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
5oql_1_2_588-2386
Could not find nucleotides of chain 2 in annotation 5oql.json. Either there is a problem with 5oql mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
5oql_1_2_588-2383
Could not find nucleotides of chain 2 in annotation 5oql.json. Either there is a problem with 5oql mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
5oql_1_2_583-2388
Could not find nucleotides of chain 2 in annotation 5oql.json. Either there is a problem with 5oql mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
6rxv_1_C2_588-2386
Could not find nucleotides of chain C2 in annotation 6rxv.json. Either there is a problem with 6rxv mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
6rxv_1_C2_588-2383
Could not find nucleotides of chain C2 in annotation 6rxv.json. Either there is a problem with 6rxv mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
6rxv_1_C2_583-2388
Could not find nucleotides of chain C2 in annotation 6rxv.json. Either there is a problem with 6rxv mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
6rxz_1_C2_588-2386
Could not find nucleotides of chain C2 in annotation 6rxz.json. Either there is a problem with 6rxz mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
6rxz_1_C2_588-2383
Could not find nucleotides of chain C2 in annotation 6rxz.json. Either there is a problem with 6rxz mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
6rxz_1_C2_583-2388
Could not find nucleotides of chain C2 in annotation 6rxz.json. Either there is a problem with 6rxz mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
6rxy_1_C2_588-2386
Could not find nucleotides of chain C2 in annotation 6rxy.json. Either there is a problem with 6rxy mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
6rxy_1_C2_588-2383
Could not find nucleotides of chain C2 in annotation 6rxy.json. Either there is a problem with 6rxy mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
6rxy_1_C2_583-2388
Could not find nucleotides of chain C2 in annotation 6rxy.json. Either there is a problem with 6rxy mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
6rxt_1_C2_588-2386
Could not find nucleotides of chain C2 in annotation 6rxt.json. Either there is a problem with 6rxt mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
6rxt_1_C2_588-2383
Could not find nucleotides of chain C2 in annotation 6rxt.json. Either there is a problem with 6rxt mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
6rxt_1_C2_583-2388
Could not find nucleotides of chain C2 in annotation 6rxt.json. Either there is a problem with 6rxt mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
4v48_1_BA_1-91
DSSR warning 4v48.json: no nucleotides found. Ignoring 4v48_1_BA_1-91.
4v48_1_BA_6-1541
DSSR warning 4v48.json: no nucleotides found. Ignoring 4v48_1_BA_6-1541.
4v48_1_BA_6-1538
DSSR warning 4v48.json: no nucleotides found. Ignoring 4v48_1_BA_6-1538.
4v48_1_BA_1-1543
DSSR warning 4v48.json: no nucleotides found. Ignoring 4v48_1_BA_1-1543.
4v47_1_BA_1-91
DSSR warning 4v47.json: no nucleotides found. Ignoring 4v47_1_BA_1-91.
4v47_1_BA_6-1540
DSSR warning 4v47.json: no nucleotides found. Ignoring 4v47_1_BA_6-1540.
4v47_1_BA_6-1537
DSSR warning 4v47.json: no nucleotides found. Ignoring 4v47_1_BA_6-1537.
4v47_1_BA_1-1542
DSSR warning 4v47.json: no nucleotides found. Ignoring 4v47_1_BA_1-1542.
2rdo_1_B_6-1460
DSSR warning 2rdo.json: no nucleotides found. Ignoring 2rdo_1_B_6-1460.
2rdo_1_B_6-1522
DSSR warning 2rdo.json: no nucleotides found. Ignoring 2rdo_1_B_6-1522.
2rdo_1_B_1-2903
DSSR warning 2rdo.json: no nucleotides found. Ignoring 2rdo_1_B_1-2903.
2rdo_1_B_6-1457
DSSR warning 2rdo.json: no nucleotides found. Ignoring 2rdo_1_B_6-1457.
2rdo_1_B_1-2904
DSSR warning 2rdo.json: no nucleotides found. Ignoring 2rdo_1_B_1-2904.
2rdo_1_B_1-1528
DSSR warning 2rdo.json: no nucleotides found. Ignoring 2rdo_1_B_1-1528.
2rdo_1_B_160-2893
DSSR warning 2rdo.json: no nucleotides found. Ignoring 2rdo_1_B_160-2893.
4v48_1_A0_6-1460
DSSR warning 4v48.json: no nucleotides found. Ignoring 4v48_1_A0_6-1460.
4v48_1_A0_6-1522
DSSR warning 4v48.json: no nucleotides found. Ignoring 4v48_1_A0_6-1522.
4v48_1_A0_1-2903
DSSR warning 4v48.json: no nucleotides found. Ignoring 4v48_1_A0_1-2903.
4v48_1_A0_6-1457
DSSR warning 4v48.json: no nucleotides found. Ignoring 4v48_1_A0_6-1457.
4v48_1_A0_1-2904
DSSR warning 4v48.json: no nucleotides found. Ignoring 4v48_1_A0_1-2904.
4v48_1_A0_1-1528
DSSR warning 4v48.json: no nucleotides found. Ignoring 4v48_1_A0_1-1528.
4v48_1_A0_160-2893
DSSR warning 4v48.json: no nucleotides found. Ignoring 4v48_1_A0_160-2893.
4v47_1_A0_6-1460
DSSR warning 4v47.json: no nucleotides found. Ignoring 4v47_1_A0_6-1460.
4v47_1_A0_6-1522
DSSR warning 4v47.json: no nucleotides found. Ignoring 4v47_1_A0_6-1522.
4v47_1_A0_1-2903
DSSR warning 4v47.json: no nucleotides found. Ignoring 4v47_1_A0_1-2903.
4v47_1_A0_6-1457
DSSR warning 4v47.json: no nucleotides found. Ignoring 4v47_1_A0_6-1457.
4v47_1_A0_1-2904
DSSR warning 4v47.json: no nucleotides found. Ignoring 4v47_1_A0_1-2904.
4v47_1_A0_1-1528
DSSR warning 4v47.json: no nucleotides found. Ignoring 4v47_1_A0_1-1528.
4v47_1_A0_160-2893
DSSR warning 4v47.json: no nucleotides found. Ignoring 4v47_1_A0_160-2893.
1zc8_1_A_1-59
DSSR warning 1zc8.json: no nucleotides found. Ignoring 1zc8_1_A_1-59.
1mvr_1_D_1-59
DSSR warning 1mvr.json: no nucleotides found. Ignoring 1mvr_1_D_1-59.
4c9d_1_D_29-1
Mapping is reversed, this case is not supported (yet).
4c9d_1_C_29-1
Mapping is reversed, this case is not supported (yet).
4adx_1_9_1-121
DSSR warning 4adx.json: no nucleotides found. Ignoring 4adx_1_9_1-121.
1zn1_1_B_1-59
DSSR warning 1zn1.json: no nucleotides found. Ignoring 1zn1_1_B_1-59.
1emi_1_B_1-108
DSSR warning 1emi.json: no nucleotides found. Ignoring 1emi_1_B_1-108.
3iy9_1_A_498-1027
DSSR warning 3iy9.json: no nucleotides found. Ignoring 3iy9_1_A_498-1027.
1jgq_1_A_20-55
Could not find nucleotides of chain A in annotation 1jgq.json. Either there is a problem with 1jgq mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
1jgq_1_A_7-1518
Could not find nucleotides of chain A in annotation 1jgq.json. Either there is a problem with 1jgq mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
1jgq_1_A_7-1515
Could not find nucleotides of chain A in annotation 1jgq.json. Either there is a problem with 1jgq mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
1jgq_1_A_2-1520
Could not find nucleotides of chain A in annotation 1jgq.json. Either there is a problem with 1jgq mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
4v42_1_AA_20-55
Could not find nucleotides of chain AA in annotation 4v42.json. Either there is a problem with 4v42 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
4v42_1_AA_7-1518
Could not find nucleotides of chain AA in annotation 4v42.json. Either there is a problem with 4v42 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
4v42_1_AA_7-1515
Could not find nucleotides of chain AA in annotation 4v42.json. Either there is a problem with 4v42 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
4v42_1_AA_2-1520
Could not find nucleotides of chain AA in annotation 4v42.json. Either there is a problem with 4v42 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
1jgo_1_A_20-55
Could not find nucleotides of chain A in annotation 1jgo.json. Either there is a problem with 1jgo mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
1jgo_1_A_7-1518
Could not find nucleotides of chain A in annotation 1jgo.json. Either there is a problem with 1jgo mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
1jgo_1_A_7-1515
Could not find nucleotides of chain A in annotation 1jgo.json. Either there is a problem with 1jgo mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
1jgo_1_A_2-1520
Could not find nucleotides of chain A in annotation 1jgo.json. Either there is a problem with 1jgo mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
1jgp_1_A_20-55
Could not find nucleotides of chain A in annotation 1jgp.json. Either there is a problem with 1jgp mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
1jgp_1_A_7-1518
Could not find nucleotides of chain A in annotation 1jgp.json. Either there is a problem with 1jgp mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
1jgp_1_A_7-1515
Could not find nucleotides of chain A in annotation 1jgp.json. Either there is a problem with 1jgp mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
1jgp_1_A_2-1520
Could not find nucleotides of chain A in annotation 1jgp.json. Either there is a problem with 1jgp mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
3ep2_1_B_1-50
DSSR warning 3ep2.json: no nucleotides found. Ignoring 3ep2_1_B_1-50.
3eq3_1_B_1-50
DSSR warning 3eq3.json: no nucleotides found. Ignoring 3eq3_1_B_1-50.
3eq4_1_B_1-50
DSSR warning 3eq4.json: no nucleotides found. Ignoring 3eq4_1_B_1-50.
3pgw_1_R_1-164
DSSR warning 3pgw.json: no nucleotides found. Ignoring 3pgw_1_R_1-164.
3pgw_1_N_1-164
DSSR warning 3pgw.json: no nucleotides found. Ignoring 3pgw_1_N_1-164.
3cw1_1_x_1-138
DSSR warning 3cw1.json: no nucleotides found. Ignoring 3cw1_1_x_1-138.
3cw1_1_w_1-138
DSSR warning 3cw1.json: no nucleotides found. Ignoring 3cw1_1_w_1-138.
3cw1_1_V_1-138
DSSR warning 3cw1.json: no nucleotides found. Ignoring 3cw1_1_V_1-138.
3cw1_1_v_1-138
DSSR warning 3cw1.json: no nucleotides found. Ignoring 3cw1_1_v_1-138.
2iy3_1_B_9-105
DSSR warning 2iy3.json: no nucleotides found. Ignoring 2iy3_1_B_9-105.
3jcr_1_N_1-106
DSSR warning 3jcr.json: no nucleotides found. Ignoring 3jcr_1_N_1-106.
3jcr_1_N_1-188
DSSR warning 3jcr.json: no nucleotides found. Ignoring 3jcr_1_N_1-188.
2vaz_1_A_64-177
DSSR warning 2vaz.json: no nucleotides found. Ignoring 2vaz_1_A_64-177.
2ftc_1_R_81-1466
DSSR warning 2ftc.json: no nucleotides found. Ignoring 2ftc_1_R_81-1466.
2ftc_1_R_1-1568
DSSR warning 2ftc.json: no nucleotides found. Ignoring 2ftc_1_R_1-1568.
2ftc_1_R_792-1568
DSSR warning 2ftc.json: no nucleotides found. Ignoring 2ftc_1_R_792-1568.
3jcr_1_M_1-141
DSSR warning 3jcr.json: no nucleotides found. Ignoring 3jcr_1_M_1-141.
3jcr_1_M_1-107
DSSR warning 3jcr.json: no nucleotides found. Ignoring 3jcr_1_M_1-107.
3jcr_1_M_1-188
DSSR warning 3jcr.json: no nucleotides found. Ignoring 3jcr_1_M_1-188.
4v5z_1_B0_1-2840
DSSR warning 4v5z.json: no nucleotides found. Ignoring 4v5z_1_B0_1-2840.
4v5z_1_B0_1-2899
DSSR warning 4v5z.json: no nucleotides found. Ignoring 4v5z_1_B0_1-2899.
4v5z_1_B0_1-2902
DSSR warning 4v5z.json: no nucleotides found. Ignoring 4v5z_1_B0_1-2902.
5g2x_1_A_595-692
Sequence is too short. (< 5 resolved nts)
3iy8_1_A_1-540
DSSR warning 3iy8.json: no nucleotides found. Ignoring 3iy8_1_A_1-540.
4v5z_1_BY_2-113
DSSR warning 4v5z.json: no nucleotides found. Ignoring 4v5z_1_BY_2-113.
4v5z_1_BZ_1-70
DSSR warning 4v5z.json: no nucleotides found. Ignoring 4v5z_1_BZ_1-70.
1mvr_1_B_1-96
DSSR warning 1mvr.json: no nucleotides found. Ignoring 1mvr_1_B_1-96.
4adx_1_0_1-2923
DSSR warning 4adx.json: no nucleotides found. Ignoring 4adx_1_0_1-2923.
4adx_1_0_132-2915
DSSR warning 4adx.json: no nucleotides found. Ignoring 4adx_1_0_132-2915.
3eq4_1_Y_1-69
DSSR warning 3eq4.json: no nucleotides found. Ignoring 3eq4_1_Y_1-69.
4v5z_1_AA_1-1562
DSSR warning 4v5z.json: no nucleotides found. Ignoring 4v5z_1_AA_1-1562.
4v5z_1_AA_1-1563
DSSR warning 4v5z.json: no nucleotides found. Ignoring 4v5z_1_AA_1-1563.
6lqm_1_8_1267-4755
Could not find nucleotides of chain 8 in annotation 6lqm.json. Either there is a problem with 6lqm mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
6lu8_1_8_1267-4755
Could not find nucleotides of chain 8 in annotation 6lu8.json. Either there is a problem with 6lu8 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
6lsr_1_8_1267-4755
Could not find nucleotides of chain 8 in annotation 6lsr.json. Either there is a problem with 6lsr mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
6lss_1_8_1267-4755
Could not find nucleotides of chain 8 in annotation 6lss.json. Either there is a problem with 6lss mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
......@@ -5,7 +5,7 @@
# in the database.
# This should be run from the folder where the file is (to access the database with path "results/RNANet.db")
import os, pickle, sqlite3, sys
import os, pickle, sqlite3, shlex, subprocess, sys
import numpy as np
import pandas as pd
import threading as th
......@@ -16,14 +16,13 @@ import matplotlib.patches as mpatches
import scipy.cluster.hierarchy as sch
from scipy.spatial.distance import squareform
from mpl_toolkits.mplot3d import axes3d
from Bio.Phylo.TreeConstruction import DistanceCalculator
from Bio import AlignIO, SeqIO
from functools import partial
from multiprocessing import Pool
from multiprocessing import Pool, Manager
from os import path
from tqdm import tqdm
from collections import Counter
from RNAnet import read_cpu_number, sql_ask_database, sql_execute, warn, notify, init_worker
from RNAnet import Job, read_cpu_number, sql_ask_database, sql_execute, warn, notify, init_worker
# This sets the paths
if len(sys.argv) > 1:
......@@ -37,7 +36,7 @@ else:
LSU_set = ("RF00002", "RF02540", "RF02541", "RF02543", "RF02546") # From Rfam CLAN 00112
SSU_set = ("RF00177", "RF02542", "RF02545", "RF01959", "RF01960") # From Rfam CLAN 00111
def reproduce_wadley_results(show=False, carbon=4, sd_range=(1,4)):
def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4)):
"""
Plot the joint distribution of pseudotorsion angles, in a Ramachandran-style graph.
See Wadley & Pyle (2007)
......@@ -68,6 +67,12 @@ def reproduce_wadley_results(show=False, carbon=4, sd_range=(1,4)):
if not path.isfile(f"data/wadley_kernel_{angle}.npz"):
# Get a worker number to position the progress bar
global idxQueue
thr_idx = idxQueue.get()
pbar = tqdm(total=2, desc=f"Worker {thr_idx+1}: eta/theta C{carbon} kernels", position=thr_idx+1, leave=False)
# Extract the angle values of c2'-endo and c3'-endo nucleotides
with sqlite3.connect("results/RNANet.db") as conn:
df = pd.read_sql(f"""SELECT {angle}, th{angle} FROM nucleotide WHERE puckering="C2'-endo" AND {angle} IS NOT NULL AND th{angle} IS NOT NULL;""", conn)
......@@ -89,13 +94,17 @@ def reproduce_wadley_results(show=False, carbon=4, sd_range=(1,4)):
xx, yy = np.mgrid[0:2*np.pi:100j, 0:2*np.pi:100j]
positions = np.vstack([xx.ravel(), yy.ravel()])
f_c3 = np.reshape(kernel_c3(positions).T, xx.shape)
pbar.update(1)
f_c2 = np.reshape(kernel_c2(positions).T, xx.shape)
pbar.update(1)
# Save the data to an archive for later use without the need to recompute
np.savez(f"data/wadley_kernel_{angle}.npz",
c3_endo_e=c3_endo_etas, c3_endo_t=c3_endo_thetas,
c2_endo_e=c2_endo_etas, c2_endo_t=c2_endo_thetas,
kernel_c3=f_c3, kernel_c2=f_c2)
pbar.close()
idxQueue.put(thr_idx)
else:
f = np.load(f"data/wadley_kernel_{angle}.npz")
c2_endo_etas = f["c2_endo_e"]
......@@ -106,7 +115,7 @@ def reproduce_wadley_results(show=False, carbon=4, sd_range=(1,4)):
f_c2 = f["kernel_c2"]
xx, yy = np.mgrid[0:2*np.pi:100j, 0:2*np.pi:100j]
notify(f"Kernel computed for {angle}/th{angle} (or loaded from file).")
# notify(f"Kernel computed for {angle}/th{angle} (or loaded from file).")
# exact counts:
hist_c2, xedges, yedges = np.histogram2d(c2_endo_etas, c2_endo_thetas, bins=int(2*np.pi/0.1),
......@@ -139,7 +148,7 @@ def reproduce_wadley_results(show=False, carbon=4, sd_range=(1,4)):
fig.savefig(f"results/figures/wadley_plots/wadley_hist_{angle}_{l}.png")
if show:
fig.show()
fig.close()
plt.close()
# Smoothed joint distribution
fig = plt.figure()
......@@ -150,7 +159,7 @@ def reproduce_wadley_results(show=False, carbon=4, sd_range=(1,4)):
fig.savefig(f"results/figures/wadley_plots/wadley_distrib_{angle}_{l}.png")
if show:
fig.show()
fig.close()
plt.close()
# 2D Wadley plot
fig = plt.figure(figsize=(5,5))
......@@ -163,7 +172,7 @@ def reproduce_wadley_results(show=False, carbon=4, sd_range=(1,4)):
fig.savefig(f"results/figures/wadley_plots/wadley_{angle}_{l}.png")
if show:
fig.show()
fig.close()
plt.close()
# print(f"[{worker_nbr}]\tComputed joint distribution of angles (C{carbon}) and saved the figures.")
def stats_len():
......@@ -171,11 +180,15 @@ def stats_len():
REQUIRES tables chain, nucleotide up to date.
"""
# Get a worker number to position the progress bar
global idxQueue
thr_idx = idxQueue.get()
cols = []
lengths = []
conn = sqlite3.connect("results/RNANet.db")
for i,f in enumerate(fam_list):
for i,f in enumerate(tqdm(fam_list, position=thr_idx+1, desc=f"Worker {thr_idx+1}: Average chain lengths", leave=False)):
# Define a color for that family in the plot
if f in LSU_set:
......@@ -190,11 +203,11 @@ def stats_len():
cols.append("grey")
# Get the lengths of chains
l = [ x[0] for x in sql_ask_database(conn, f"SELECT COUNT(index_chain) FROM (SELECT chain_id FROM chain WHERE rfam_acc='{f}') NATURAL JOIN nucleotide GROUP BY chain_id;") ]
with sqlite3.connect("results/RNANet.db") as conn:
l = [ x[0] for x in sql_ask_database(conn, f"SELECT COUNT(index_chain) FROM (SELECT chain_id FROM chain WHERE rfam_acc='{f}') NATURAL JOIN nucleotide GROUP BY chain_id;", warn_every=0) ]
lengths.append(l)
notify(f"[{i+1}/{len(fam_list)}] Computed {f} chains lengths")
conn.close()
# notify(f"[{i+1}/{len(fam_list)}] Computed {f} chains lengths")
# Plot the figure
fig = plt.figure(figsize=(10,3))
......@@ -223,7 +236,8 @@ def stats_len():
# Save the figure
fig.savefig("results/figures/lengths.png")
notify("Computed sequence length statistics and saved the figure.")
idxQueue.put(thr_idx) # replace the thread index in the queue
# notify("Computed sequence length statistics and saved the figure.")
def format_percentage(tot, x):
if not tot:
......@@ -242,40 +256,57 @@ def stats_freq():
Outputs results/frequencies.csv
REQUIRES tables chain, nucleotide up to date."""
# Get a worker number to position the progress bar
global idxQueue
thr_idx = idxQueue.get()
# Initialize a Counter object for each family
freqs = {}
for f in fam_list:
freqs[f] = Counter()
# List all nt_names happening within a RNA family and store the counts in the Counter
conn = sqlite3.connect("results/RNANet.db")
for i,f in enumerate(fam_list):
counts = dict(sql_ask_database(conn, f"SELECT nt_name, COUNT(nt_name) FROM (SELECT chain_id from chain WHERE rfam_acc='{f}') NATURAL JOIN nucleotide GROUP BY nt_name;"))
for i,f in enumerate(tqdm(fam_list, position=thr_idx+1, desc=f"Worker {thr_idx+1}: Base frequencies", leave=False)):
with sqlite3.connect("results/RNANet.db") as conn:
counts = dict(sql_ask_database(conn, f"SELECT nt_name, COUNT(nt_name) FROM (SELECT chain_id from chain WHERE rfam_acc='{f}') NATURAL JOIN nucleotide GROUP BY nt_name;", warn_every=0))
freqs[f].update(counts)
notify(f"[{i+1}/{len(fam_list)}] Computed {f} nucleotide frequencies.")
conn.close()
# notify(f"[{i+1}/{len(fam_list)}] Computed {f} nucleotide frequencies.")
# Create a pandas DataFrame, and save it to CSV.
df = pd.DataFrame()
for f in fam_list:
for f in tqdm(fam_list, position=thr_idx+1, desc=f"Worker {thr_idx+1}: Base frequencies", leave=False):
tot = sum(freqs[f].values())
df = pd.concat([ df, pd.DataFrame([[ format_percentage(tot, x) for x in freqs[f].values() ]], columns=list(freqs[f]), index=[f]) ])
df = df.fillna(0)
df.to_csv("results/frequencies.csv")
notify("Saved nucleotide frequencies to CSV file.")
idxQueue.put(thr_idx) # replace the thread index in the queue
# notify("Saved nucleotide frequencies to CSV file.")
def parallel_stats_pairs(f):
"""Counts occurrences of intra-chain base-pair types in one RNA family
REQUIRES tables chain, nucleotide up-to-date."""
if path.isfile("data/"+f+"_pairs.csv") and path.isfile("data/"+f+"_counts.csv"):
return
# Get a worker number to position the progress bar
global idxQueue
thr_idx = idxQueue.get()
chain_id_list = mappings_list[f]
data = []
for cid in chain_id_list:
sqldata = []
for cid in tqdm(chain_id_list, position=thr_idx+1, desc=f"Worker {thr_idx+1}: {f} basepair types", leave=False):
with sqlite3.connect("results/RNANet.db") as conn:
# Get comma separated lists of basepairs per nucleotide
interactions = pd.read_sql(f"SELECT nt_code as nt1, index_chain, paired, pair_type_LW FROM (SELECT chain_id FROM chain WHERE chain_id='{cid}') NATURAL JOIN nucleotide;", conn)
interactions = pd.DataFrame(
sql_ask_database(conn,
f"SELECT nt_code as nt1, index_chain, paired, pair_type_LW FROM (SELECT chain_id FROM chain WHERE chain_id='{cid}') NATURAL JOIN nucleotide;",
warn_every=0),
columns = ["nt1", "index_chain", "paired", "pair_type_LW"]
)
# expand the comma-separated lists in real lists
expanded_list = pd.concat([ pd.DataFrame({ 'nt1':[ row["nt1"] for x in row["paired"].split(',') ],
'index_chain':[ row['index_chain'] for x in row["paired"].split(',') ],
......@@ -317,27 +348,29 @@ def parallel_stats_pairs(f):
# Update the database
vlcnts = expanded_list.pair_type_LW.value_counts()
sqldata = ( vlcnts.at["cWW"]/2 if "cWW" in vlcnts.index else 0,
vlcnts.at["cWH"] if "cWH" in vlcnts.index else 0,
vlcnts.at["cWS"] if "cWS" in vlcnts.index else 0,
vlcnts.at["cHH"]/2 if "cHH" in vlcnts.index else 0,
vlcnts.at["cHS"] if "cHS" in vlcnts.index else 0,
vlcnts.at["cSS"]/2 if "cSS" in vlcnts.index else 0,
vlcnts.at["tWW"]/2 if "tWW" in vlcnts.index else 0,
vlcnts.at["tWH"] if "tWH" in vlcnts.index else 0,
vlcnts.at["tWS"] if "tWS" in vlcnts.index else 0,
vlcnts.at["tHH"]/2 if "tHH" in vlcnts.index else 0,
vlcnts.at["tHS"] if "tHS" in vlcnts.index else 0,
vlcnts.at["tSS"]/2 if "tSS" in vlcnts.index else 0,
int(sum(vlcnts.loc[[ str(x) for x in vlcnts.index if "." in str(x)]])/2),
cid)
with sqlite3.connect("results/RNANet.db") as conn:
sql_execute(conn, """UPDATE chain SET pair_count_cWW = ?, pair_count_cWH = ?, pair_count_cWS = ?, pair_count_cHH = ?,
pair_count_cHS = ?, pair_count_cSS = ?, pair_count_tWW = ?, pair_count_tWH = ?, pair_count_tWS = ?,
pair_count_tHH = ?, pair_count_tHS = ?, pair_count_tSS = ?, pair_count_other = ? WHERE chain_id = ?;""", data=sqldata)
sqldata.append( ( vlcnts.at["cWW"]/2 if "cWW" in vlcnts.index else 0,
vlcnts.at["cWH"] if "cWH" in vlcnts.index else 0,
vlcnts.at["cWS"] if "cWS" in vlcnts.index else 0,
vlcnts.at["cHH"]/2 if "cHH" in vlcnts.index else 0,
vlcnts.at["cHS"] if "cHS" in vlcnts.index else 0,
vlcnts.at["cSS"]/2 if "cSS" in vlcnts.index else 0,
vlcnts.at["tWW"]/2 if "tWW" in vlcnts.index else 0,
vlcnts.at["tWH"] if "tWH" in vlcnts.index else 0,
vlcnts.at["tWS"] if "tWS" in vlcnts.index else 0,
vlcnts.at["tHH"]/2 if "tHH" in vlcnts.index else 0,
vlcnts.at["tHS"] if "tHS" in vlcnts.index else 0,
vlcnts.at["tSS"]/2 if "tSS" in vlcnts.index else 0,
int(sum(vlcnts.loc[[ str(x) for x in vlcnts.index if "." in str(x)]])/2),
cid) )
data.append(expanded_list)
# Update the database
with sqlite3.connect("results/RNANet.db", isolation_level=None) as conn:
conn.execute('pragma journal_mode=wal') # Allow multiple other readers to ask things while we execute this writing query
sql_execute(conn, """UPDATE chain SET pair_count_cWW = ?, pair_count_cWH = ?, pair_count_cWS = ?, pair_count_cHH = ?,
pair_count_cHS = ?, pair_count_cSS = ?, pair_count_tWW = ?, pair_count_tWH = ?, pair_count_tWS = ?,
pair_count_tHH = ?, pair_count_tHS = ?, pair_count_tSS = ?, pair_count_other = ? WHERE chain_id = ?;""", many=True, data=sqldata, warn_every=0)
# merge all the dataframes from all chains of the family
expanded_list = pd.concat(data)
......@@ -351,7 +384,106 @@ def parallel_stats_pairs(f):
# Create an output DataFrame
f_df = pd.DataFrame([[ x for x in cnt.values() ]], columns=list(cnt), index=[f])
return expanded_list, f_df
f_df.to_csv(f"data/{f}_counts.csv")
expanded_list.to_csv(f"data/{f}_pairs.csv")
idxQueue.put(thr_idx) # replace the thread index in the queue
def to_dist_matrix(f):
    """Compute the pairwise %-identity matrix of the 3D-mapped sequences of family *f*.

    Extracts the sequences whose id contains '[' (the chains mapped to 3D
    structures) from the family alignment
    <path_to_seq_data>/realigned/<f>++.afa, writes them to a temporary
    Stockholm file, and runs Easel's `esl-alipid` on it. Parsed scores are
    stored in data/<f>.npy; the file acts as a cache for later runs.

    Returns 0 whether the matrix was computed or already cached.

    NOTE(review): esl-alipid reports identity as a percentage (0-100);
    confirm downstream consumers expecting a 0-1 distance scale.
    """
    if path.isfile("data/" + f + ".npy"):
        # Result already cached by a previous run.
        return 0

    # Get a worker number to position the progress bar
    global idxQueue
    thr_idx = idxQueue.get()

    # Extract the 3D-only sequences into a temporary Stockholm file
    with open(path_to_seq_data + f"/realigned/{f}++.afa") as al_file:
        al = AlignIO.read(al_file, "fasta")
    names = [x.id for x in al if '[' in x.id]
    al = al[-len(names):]
    with open(f + "_3d_only.stk", "w") as only_3d:
        only_3d.write(al.format("stockholm"))
    del al  # the alignment can be large; free it before spawning the child

    # Run esl-alipid and parse its tabular output (seq1 seq2 %id ...)
    command = f"esl-alipid --rna --noheader --informat stockholm {f}_3d_only.stk"
    process = subprocess.Popen(shlex.split(command), stdout=subprocess.PIPE)
    id_matrix = np.zeros((len(names), len(names)))
    index_of = {name: i for i, name in enumerate(names)}  # O(1) lookups instead of list.index()
    pbar = tqdm(total=len(names) * (len(names) - 1) * 0.5, position=thr_idx + 1,
                desc=f"Worker {thr_idx+1}: {f} idty matrix", leave=False)
    # Iterate the pipe until EOF instead of polling: the former poll()/readline()
    # loop dropped any lines still buffered when the child exited between the
    # poll and the read, losing the last pair scores.
    for raw in process.stdout:
        fields = raw.split()
        if len(fields) < 3:
            continue  # skip blank/malformed lines defensively
        s1 = fields[0].decode('utf-8')
        s2 = fields[1].decode('utf-8')
        score = fields[2].decode('utf-8')
        id_matrix[index_of[s1], index_of[s2]] = float(score)
        pbar.update(1)
    process.wait()  # reap the child process
    pbar.close()

    # Clean up the temporary alignment and cache the matrix
    subprocess.run(["rm", "-f", f + "_3d_only.stk"])
    np.save("data/" + f + ".npy", id_matrix)
    idxQueue.put(thr_idx)  # replace the thread index in the queue
    return 0
def seq_idty():
    """Computes identity matrices for each of the RNA families.

    Loads the per-family matrices cached by to_dist_matrix() from data/*.npy,
    stores each family's average identity in the `family` table, then draws
    one clustered identity heatmap per family into results/figures/distances.png.

    REQUIRES temporary results files in data/*.npy
    REQUIRES tables chain, family up to date.

    NOTE(review): indentation was lost in this extract; confirm the loop/branch
    nesting against the repository before reusing this text as code.
    """
    # load distance matrices
    fam_arrays = []
    for f in famlist:
    if path.isfile("data/"+f+".npy"):
    fam_arrays.append(np.load("data/"+f+".npy"))
    else:
    # Families without a cached matrix get an empty placeholder and are
    # skipped by the `if not len(D)` guards below.
    fam_arrays.append([])
    # Update database with identity percentages
    conn = sqlite3.connect("results/RNANet.db")
    for f, D in zip(famlist, fam_arrays):
    if not len(D): continue
    # NOTE(review): assumes D holds distances in [0,1]; the esl-alipid-based
    # to_dist_matrix stores raw percent identities (0-100), and np.average
    # over D + D.T also includes the zero diagonal — confirm the intended
    # normalisation.
    a = 1.0 - np.average(D + D.T) # Get symmetric matrix instead of lower triangle + convert from distance matrix to identity matrix
    conn.execute(f"UPDATE family SET idty_percent = {round(float(a),2)} WHERE rfam_acc = '{f}';")
    conn.commit()
    conn.close()
    # Plots plots plots
    fig, axs = plt.subplots(4,17, figsize=(17,5.75))
    axs = axs.ravel()
    [axi.set_axis_off() for axi in axs]
    im = "" # Just to declare the variable, it will be set in the loop
    for f, D, ax in zip(famlist, fam_arrays, axs):
    if not len(D): continue
    if D.shape[0] > 2: # Cluster only if there is more than 2 sequences to organize
    D = D + D.T # Copy the lower triangle to upper, to get a symetrical matrix
    condensedD = squareform(D)
    # Compute basic dendrogram by Ward's method
    Y = sch.linkage(condensedD, method='ward')
    Z = sch.dendrogram(Y, orientation='left', no_plot=True)
    # Reorganize rows and cols
    idx1 = Z['leaves']
    D = D[idx1,:]
    D = D[:,idx1[::-1]]
    im = ax.matshow(1.0 - D, vmin=0, vmax=1, origin='lower') # convert to identity matrix 1 - D from distance matrix D
    ax.set_title(f + "\n(" + str(len(mappings_list[f]))+ " chains)", fontsize=10)
    fig.tight_layout()
    fig.subplots_adjust(wspace=0.1, hspace=0.3)
    # The colorbar reuses the last `im` handle set in the loop above.
    fig.colorbar(im, ax=axs[-1], shrink=0.8)
    fig.savefig(f"results/figures/distances.png")
    notify("Computed all identity matrices and saved the figure.")
def stats_pairs():
"""Counts occurrences of intra-chain base-pair types in RNA families
......@@ -363,26 +495,15 @@ def stats_pairs():
return family_data.apply(partial(format_percentage, sum(family_data)))
if not path.isfile("data/pair_counts.csv"):
p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=read_cpu_number(), maxtasksperchild=5)
try:
fam_pbar = tqdm(total=len(fam_list), desc="Pair-types in families", position=0, leave=True)
results = []
allpairs = []
for _, newp_famdf in enumerate(p.imap_unordered(parallel_stats_pairs, fam_list)):
newpairs, fam_df = newp_famdf
fam_pbar.update(1)
results.append(fam_df)
allpairs.append(newpairs)
fam_pbar.close()
p.close()
p.join()
except KeyboardInterrupt:
warn("KeyboardInterrupt, terminating workers.", error=True)
fam_pbar.close()
p.terminate()
p.join()
exit(1)
results = []
allpairs = []
for f in fam_list:
newpairs = pd.read_csv(f"data/{f}_pairs.csv", index_col=0)
fam_df = pd.read_csv(f"data/{f}_counts.csv", index_col=0)
results.append(fam_df)
allpairs.append(newpairs)
subprocess.run(["rm", "-f", f"data/{f}_pairs.csv"])
subprocess.run(["rm", "-f", f"data/{f}_counts.csv"])
all_pairs = pd.concat(allpairs)
df = pd.concat(results).fillna(0)
df.to_csv("data/pair_counts.csv")
......@@ -431,92 +552,12 @@ def stats_pairs():
notify("Computed nucleotide statistics and saved CSV and PNG file.")
def to_dist_matrix(f):
    """Compute (or load from cache) the identity-distance matrix of family *f*.

    Runs Bio.Phylo's DistanceCalculator('identity') on the last
    len(mappings_list[f]) records of <path_to_seq_data>/realigned/<f>++.afa,
    then caches the lower-triangular matrix, zero-padded to square rows,
    into data/<f>.npy. Returns 0 in every case.
    """
    # NOTE(review): superseded variant — an esl-alipid-based to_dist_matrix is
    # also present in this file; in a single module the later definition wins.
    if path.isfile("data/"+f+".npy"):
    notify(f"Computed {f} distance matrix", "loaded from file")
    return 0
    notify(f"Computing {f} distance matrix from alignment...")
    dm = DistanceCalculator('identity')
    with open(path_to_seq_data+"/realigned/"+f+"++.afa") as al_file:
    # Keep only the tail of the alignment: the chains with 3D structures.
    al = AlignIO.read(al_file, "fasta")[-len(mappings_list[f]):]
    idty = dm.get_distance(al).matrix # list of lists
    del al
    l = len(idty)
    # Pad each lower-triangle row with zeros so every row has length l.
    np.save("data/"+f+".npy", np.array([ idty[i] + [0]*(l-1-i) if i<l-1 else idty[i] for i in range(l) ], dtype=object))
    del idty
    notify(f"Computed {f} distance matrix")
    return 0
def seq_idty():
    """Computes identity matrices for each of the RNA families.

    Selects families with at least two chains, computes their distance
    matrices in parallel via to_dist_matrix(), stores each family's average
    identity in the `family` table, and plots one clustered heatmap per
    family into results/figures/distances.png.

    Creates temporary results files in data/*.npy
    REQUIRES tables chain, family up to date."""
    # NOTE(review): superseded variant — a refactored seq_idty (matrices
    # precomputed by pool jobs) is also present in this file.
    # List the families for which we will compute sequence identity matrices
    conn = sqlite3.connect("results/RNANet.db")
    famlist = [ x[0] for x in sql_ask_database(conn, "SELECT rfam_acc from (SELECT rfam_acc, COUNT(chain_id) as n_chains FROM family NATURAL JOIN chain GROUP BY rfam_acc) WHERE n_chains > 1 ORDER BY rfam_acc ASC;") ]
    ignored = [ x[0] for x in sql_ask_database(conn, "SELECT rfam_acc from (SELECT rfam_acc, COUNT(chain_id) as n_chains FROM family NATURAL JOIN chain GROUP BY rfam_acc) WHERE n_chains < 2 ORDER BY rfam_acc ASC;") ]
    if len(ignored):
    print(f"Idty matrices: Ignoring {len(ignored)} families with only one chain:", " ".join(ignored)+'\n')
    # compute distance matrices (or ignore if data/RF0****.npy exists)
    p = Pool(processes=8)
    p.map(to_dist_matrix, famlist)
    p.close()
    p.join()
    # load them
    fam_arrays = []
    for f in famlist:
    if path.isfile("data/"+f+".npy"):
    fam_arrays.append(np.load("data/"+f+".npy"))
    else:
    # Missing matrices become empty placeholders, skipped below.
    fam_arrays.append([])
    # Update database with identity percentages
    conn = sqlite3.connect("results/RNANet.db")
    for f, D in zip(famlist, fam_arrays):
    if not len(D): continue
    # NOTE(review): np.average over D + D.T also averages the zero diagonal;
    # confirm the intended normalisation.
    a = 1.0 - np.average(D + D.T) # Get symmetric matrix instead of lower triangle + convert from distance matrix to identity matrix
    conn.execute(f"UPDATE family SET idty_percent = {round(float(a),2)} WHERE rfam_acc = '{f}';")
    conn.commit()
    conn.close()
    # Plots plots plots
    fig, axs = plt.subplots(4,17, figsize=(17,5.75))
    axs = axs.ravel()
    [axi.set_axis_off() for axi in axs]
    im = "" # Just to declare the variable, it will be set in the loop
    for f, D, ax in zip(famlist, fam_arrays, axs):
    if not len(D): continue
    if D.shape[0] > 2: # Cluster only if there is more than 2 sequences to organize
    D = D + D.T # Copy the lower triangle to upper, to get a symetrical matrix
    condensedD = squareform(D)
    # Compute basic dendrogram by Ward's method
    Y = sch.linkage(condensedD, method='ward')
    Z = sch.dendrogram(Y, orientation='left', no_plot=True)
    # Reorganize rows and cols
    idx1 = Z['leaves']
    D = D[idx1,:]
    D = D[:,idx1[::-1]]
    im = ax.matshow(1.0 - D, vmin=0, vmax=1, origin='lower') # convert to identity matrix 1 - D from distance matrix D
    ax.set_title(f + "\n(" + str(len(mappings_list[f]))+ " chains)", fontsize=10)
    fig.tight_layout()
    fig.subplots_adjust(wspace=0.1, hspace=0.3)
    # The colorbar reuses the last `im` handle set in the loop above.
    fig.colorbar(im, ax=axs[-1], shrink=0.8)
    fig.savefig(f"results/figures/distances.png")
    notify("Computed all identity matrices and saved the figure.")
def per_chain_stats():
"""Computes per-chain frequencies and base-pair type counts.
REQUIRES tables chain, nucleotide up to date. """
with sqlite3.connect("results/RNANet.db") as conn:
with sqlite3.connect("results/RNANet.db", isolation_level=None) as conn:
# Compute per-chain nucleotide frequencies
df = pd.read_sql("SELECT SUM(is_A) as A, SUM(is_C) AS C, SUM(is_G) AS G, SUM(is_U) AS U, SUM(is_other) AS O, chain_id FROM nucleotide GROUP BY chain_id;", conn)
df["total"] = pd.Series(df.A + df.C + df.G + df.U + df.O, dtype=np.float64)
......@@ -524,39 +565,74 @@ def per_chain_stats():
df = df.drop("total", axis=1)
# Set the values
conn.execute('pragma journal_mode=wal')
sql_execute(conn, "UPDATE chain SET chain_freq_A = ?, chain_freq_C = ?, chain_freq_G = ?, chain_freq_U = ?, chain_freq_other = ? WHERE chain_id= ?;",
many=True, data=list(df.to_records(index=False)), warn_every=10)
notify("Updated the database with per-chain base frequencies")
def log_to_pbar(pbar):
    """Build a completion callback that advances *pbar* by one step per call.

    Intended for Pool.apply_async(callback=...): the job's return value is
    accepted and discarded; only the progress bar is touched.
    """
    def _tick(_result):
        # The async result is irrelevant here — the call itself marks progress.
        pbar.update(1)
    return _tick
# NOTE(review): this __main__ section appears to contain BOTH sides of a merge
# diff concatenated without +/- markers: the old thread-based driver (the
# `threads = [...]` part) and the new Job/Pool-based driver (from the first
# `with sqlite3.connect(...)` onward). Confirm against the repository which
# side is the live code before editing.
if __name__ == "__main__":
os.makedirs("results/figures/wadley_plots/", exist_ok=True)
print("Loading mappings list...")
# Old variant: plain connection, no issue=0 filter on chains.
conn = sqlite3.connect("results/RNANet.db")
fam_list = [ x[0] for x in sql_ask_database(conn, "SELECT rfam_acc from family ORDER BY rfam_acc ASC;") ]
mappings_list = {}
for k in fam_list:
mappings_list[k] = [ x[0] for x in sql_ask_database(conn, f"SELECT chain_id from chain WHERE rfam_acc='{k}';") ]
conn.close()
# stats_pairs()
# Define threads for the tasks
threads = [
th.Thread(target=reproduce_wadley_results, kwargs={'carbon': 1}),
th.Thread(target=reproduce_wadley_results, kwargs={'carbon': 4}),
th.Thread(target=stats_len), # computes figures
th.Thread(target=stats_freq), # Updates the database
th.Thread(target=seq_idty), # produces .npy files and seq idty figures
th.Thread(target=per_chain_stats) # Updates the database
]
# Start the threads
for t in threads:
t.start()
# Wait for the threads to complete
for t in threads:
t.join()
# New variant: context-managed connection, chains filtered on issue=0.
with sqlite3.connect("results/RNANet.db") as conn:
fam_list = [ x[0] for x in sql_ask_database(conn, "SELECT rfam_acc from family ORDER BY rfam_acc ASC;") ]
mappings_list = {}
for k in fam_list:
mappings_list[k] = [ x[0] for x in sql_ask_database(conn, f"SELECT chain_id from chain WHERE rfam_acc='{k}' and issue=0;") ]
# List the families for which we will compute sequence identity matrices
with sqlite3.connect("results/RNANet.db") as conn:
famlist = [ x[0] for x in sql_ask_database(conn, "SELECT rfam_acc from (SELECT rfam_acc, COUNT(chain_id) as n_chains FROM family NATURAL JOIN chain GROUP BY rfam_acc) WHERE n_chains > 0 ORDER BY rfam_acc ASC;") ]
ignored = [ x[0] for x in sql_ask_database(conn, "SELECT rfam_acc from (SELECT rfam_acc, COUNT(chain_id) as n_chains FROM family NATURAL JOIN chain GROUP BY rfam_acc) WHERE n_chains < 2 ORDER BY rfam_acc ASC;") ]
if len(ignored):
print(f"Idty matrices: Ignoring {len(ignored)} families with only one chain:", " ".join(ignored)+'\n')
# Prepare the multiprocessing execution environment
# NOTE(review): max(read_cpu_number()-1, 32) FLOORS the worker count at 32;
# if the intent was to cap it at 32, this should be min(...) — confirm.
nworkers = max(read_cpu_number()-1, 32)
thr_idx_mgr = Manager()
idxQueue = thr_idx_mgr.Queue()
for i in range(nworkers):
idxQueue.put(i)
# Define the tasks
joblist = []
joblist.append(Job(function=reproduce_wadley_results, args=(1,)))
joblist.append(Job(function=reproduce_wadley_results, args=(4,)))
joblist.append(Job(function=stats_len)) # Computes figures
joblist.append(Job(function=stats_freq)) # updates the database
for f in famlist:
joblist.append(Job(function=parallel_stats_pairs, args=(f,))) # updates the database
if f not in ignored:
joblist.append(Job(function=to_dist_matrix, args=(f,))) # updates the database
p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=nworkers)
pbar = tqdm(total=len(joblist), desc="Stat jobs", position=0, leave=True)
# NOTE(review): the bare `sqlite3` below is a no-op expression — almost
# certainly a stray token committed by accident; candidate for removal.
sqlite3
try:
for j in joblist:
p.apply_async(j.func_, args=j.args_, callback=log_to_pbar(pbar))
p.close()
p.join()
pbar.close()
except KeyboardInterrupt:
warn("KeyboardInterrupt, terminating workers.", error=True)
p.terminate()
p.join()
pbar.close()
exit(1)
# NOTE(review): bare except swallows every error (including SystemExit from
# the handler above on re-raise paths) and then falls through to the
# sequential work below; at minimum catch Exception and log the traceback.
except:
print("Something went wrong")
print()
print()
# finish the work after the parallel portions
per_chain_stats()
seq_idty()
stats_pairs()
......