# print(f"[{worker_nbr}]\tComputed joint distribution of angles (C{carbon}) and saved the figures.")
defstats_len():
...
...
@@ -171,11 +180,15 @@ def stats_len():
REQUIRES tables chain, nucleotide up to date.
"""
# Get a worker number to position the progress bar
globalidxQueue
thr_idx=idxQueue.get()
cols=[]
lengths=[]
conn=sqlite3.connect("results/RNANet.db")
fori,finenumerate(fam_list):
fori,finenumerate(tqdm(fam_list,position=thr_idx+1,desc=f"Worker {thr_idx+1}: Average chain lengths",leave=False)):
# Define a color for that family in the plot
iffinLSU_set:
...
...
@@ -190,11 +203,11 @@ def stats_len():
cols.append("grey")
# Get the lengths of chains
l=[x[0]forxinsql_ask_database(conn,f"SELECT COUNT(index_chain) FROM (SELECT chain_id FROM chain WHERE rfam_acc='{f}') NATURAL JOIN nucleotide GROUP BY chain_id;")]
withsqlite3.connect("results/RNANet.db")asconn:
l=[x[0]forxinsql_ask_database(conn,f"SELECT COUNT(index_chain) FROM (SELECT chain_id FROM chain WHERE rfam_acc='{f}') NATURAL JOIN nucleotide GROUP BY chain_id;",warn_every=0)]
notify("Computed sequence length statistics and saved the figure.")
idxQueue.put(thr_idx)# replace the thread index in the queue
# notify("Computed sequence length statistics and saved the figure.")
defformat_percentage(tot,x):
ifnottot:
...
...
@@ -242,40 +256,54 @@ def stats_freq():
Outputs results/frequencies.csv
REQUIRES tables chain, nucleotide up to date."""
# Get a worker number to position the progress bar
globalidxQueue
thr_idx=idxQueue.get()
# Initialize a Counter object for each family
freqs={}
forfinfam_list:
freqs[f]=Counter()
# List all nt_names happening within a RNA family and store the counts in the Counter
conn=sqlite3.connect("results/RNANet.db")
fori,finenumerate(fam_list):
counts=dict(sql_ask_database(conn,f"SELECT nt_name, COUNT(nt_name) FROM (SELECT chain_id from chain WHERE rfam_acc='{f}') NATURAL JOIN nucleotide GROUP BY nt_name;"))
fori,finenumerate(tqdm(fam_list,position=thr_idx+1,desc=f"Worker {thr_idx+1}: Base frequencies",leave=False)):
withsqlite3.connect("results/RNANet.db")asconn:
counts=dict(sql_ask_database(conn,f"SELECT nt_name, COUNT(nt_name) FROM (SELECT chain_id from chain WHERE rfam_acc='{f}') NATURAL JOIN nucleotide GROUP BY nt_name;",warn_every=0))
# Get comma separated lists of basepairs per nucleotide
interactions=pd.read_sql(f"SELECT nt_code as nt1, index_chain, paired, pair_type_LW FROM (SELECT chain_id FROM chain WHERE chain_id='{cid}') NATURAL JOIN nucleotide;",conn)
interactions=pd.DataFrame(
sql_ask_database(conn,
f"SELECT nt_code as nt1, index_chain, paired, pair_type_LW FROM (SELECT chain_id FROM chain WHERE chain_id='{cid}') NATURAL JOIN nucleotide;",
"""Computes identity matrices for each of the RNA families.
Creates temporary results files in data/*.npy
REQUIRES tables chain, family up to date."""
# List the families for which we will compute sequence identity matrices
conn=sqlite3.connect("results/RNANet.db")
famlist=[x[0]forxinsql_ask_database(conn,"SELECT rfam_acc from (SELECT rfam_acc, COUNT(chain_id) as n_chains FROM family NATURAL JOIN chain GROUP BY rfam_acc) WHERE n_chains > 1 ORDER BY rfam_acc ASC;")]
ignored=[x[0]forxinsql_ask_database(conn,"SELECT rfam_acc from (SELECT rfam_acc, COUNT(chain_id) as n_chains FROM family NATURAL JOIN chain GROUP BY rfam_acc) WHERE n_chains < 2 ORDER BY rfam_acc ASC;")]
iflen(ignored):
print(f"Idty matrices: Ignoring {len(ignored)} families with only one chain:"," ".join(ignored)+'\n')
# compute distance matrices (or ignore if data/RF0****.npy exists)
p=Pool(processes=8)
p.map(to_dist_matrix,famlist)
p.close()
p.join()
# load them
fam_arrays=[]
forfinfamlist:
ifpath.isfile("data/"+f+".npy"):
fam_arrays.append(np.load("data/"+f+".npy"))
else:
fam_arrays.append([])
# Update database with identity percentages
conn=sqlite3.connect("results/RNANet.db")
forf,Dinzip(famlist,fam_arrays):
ifnotlen(D):continue
a=1.0-np.average(D+D.T)# Get symmetric matrix instead of lower triangle + convert from distance matrix to identity matrix
conn.execute(f"UPDATE family SET idty_percent = {round(float(a),2)} WHERE rfam_acc = '{f}';")
conn.commit()
conn.close()
# Plots plots plots
fig,axs=plt.subplots(4,17,figsize=(17,5.75))
axs=axs.ravel()
[axi.set_axis_off()foraxiinaxs]
im=""# Just to declare the variable, it will be set in the loop
forf,D,axinzip(famlist,fam_arrays,axs):
ifnotlen(D):continue
ifD.shape[0]>2:# Cluster only if there is more than 2 sequences to organize
D=D+D.T# Copy the lower triangle to upper, to get a symetrical matrix
th.Thread(target=stats_freq),# Updates the database
th.Thread(target=seq_idty),# produces .npy files and seq idty figures
th.Thread(target=per_chain_stats)# Updates the database
]
# Start the threads
fortinthreads:
t.start()
# Wait for the threads to complete
fortinthreads:
t.join()
withsqlite3.connect("results/RNANet.db")asconn:
fam_list=[x[0]forxinsql_ask_database(conn,"SELECT rfam_acc from family ORDER BY rfam_acc ASC;")]
mappings_list={}
forkinfam_list:
mappings_list[k]=[x[0]forxinsql_ask_database(conn,f"SELECT chain_id from chain WHERE rfam_acc='{k}' and issue=0;")]
# List the families for which we will compute sequence identity matrices
withsqlite3.connect("results/RNANet.db")asconn:
famlist=[x[0]forxinsql_ask_database(conn,"SELECT rfam_acc from (SELECT rfam_acc, COUNT(chain_id) as n_chains FROM family NATURAL JOIN chain GROUP BY rfam_acc) WHERE n_chains > 0 ORDER BY rfam_acc ASC;")]
ignored=[x[0]forxinsql_ask_database(conn,"SELECT rfam_acc from (SELECT rfam_acc, COUNT(chain_id) as n_chains FROM family NATURAL JOIN chain GROUP BY rfam_acc) WHERE n_chains < 2 ORDER BY rfam_acc ASC;")]
iflen(ignored):
print(f"Idty matrices: Ignoring {len(ignored)} families with only one chain:"," ".join(ignored)+'\n')
# Prepare the multiprocessing execution environment