Louis BECQUEY

Latest statistics on basepair counts by chain

.gitignore
@@ -3,7 +3,7 @@ nohup.out
log_of_the_run.sh
# results
-results/figures/wadley_plots/
+results/
# temporary results files
data/

RNAnet.py
@@ -1179,7 +1179,7 @@ class Pipeline:
os.makedirs(runDir + "/results/archive/")
# Save to by-chain CSV files
-p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=1, maxtasksperchild=5)
+p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=3)
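+# (Note: work_save opens its own SQLite connection per call, with a 15 s
+# timeout, so a few parallel CSV-writing workers can share the database.)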
try:
    pbar = tqdm(total=len(self.loaded_chains), desc="Saving chains to CSV", position=0, leave=True)
    for i, _ in enumerate(p.imap_unordered(work_save, self.loaded_chains)):
@@ -1208,7 +1208,7 @@ class Pipeline:
# Save additional information
conn = sqlite3.connect(runDir+"/results/RNANet.db")
pd.read_sql_query("SELECT rfam_acc, idty_percent, nb_homologs, nb_3d_chains, nb_total_homol, max_len, comput_time, comput_peak_mem from family",
pd.read_sql_query("SELECT rfam_acc, description, idty_percent, nb_homologs, nb_3d_chains, nb_total_homol, max_len, comput_time, comput_peak_mem from family ORDER BY nb_3d_chains DESC;",
conn).to_csv(runDir + f"/results/archive/families_{time_str}.csv", float_format="%.2f", index=False)
pd.read_sql_query("""SELECT structure_id, chain_name, pdb_start, pdb_end, rfam_acc, inferred, reversed, date, exp_method, resolution, issue FROM structure
JOIN chain ON structure.pdb_id = chain.structure_id
@@ -1274,6 +1274,7 @@ class Pipeline:
conn.close()
def read_cpu_number():
# Do not use os.cpu_count() on LXC containers:
# it reads info from /sys, which reports the host's resources, not the container's.
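# A container-safe sketch (assuming Linux): len(os.sched_getaffinity(0))
# counts only the CPUs this process is allowed to use, so it respects
# container CPU limits, unlike os.cpu_count().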
@@ -2050,6 +2051,7 @@ def work_pssm(f, fill_gaps):
idxQueue.put(thr_idx) # replace the thread index in the queue
return 0
+@trace_unhandled_exceptions
def work_save(c, homology=True):
    conn = sqlite3.connect(runDir + "/results/RNANet.db", timeout=15.0)
    if homology:
@@ -2096,38 +2098,36 @@ if __name__ == "__main__":
sql_define_tables(conn)
print("> Storing results into", runDir + "/results/RNANet.db")
-# # compute an update compared to what is in the table "chain"
-# #DEBUG: list everything
-# pp.REUSE_ALL = True
-# pp.list_available_mappings()
-# # ===========================================================================
-# # 3D information
-# # ===========================================================================
-# # Download and annotate new RNA 3D chains (Chain objects in pp.update)
-# pp.dl_and_annotate(coeff_ncores=0.75)
-# # At this point, the structure table is up to date
-# pp.build_chains(coeff_ncores=2.0)
-# if len(pp.to_retry):
-#     # Redownload and re-annotate
-#     print("> Retrying to annotate some structures which just failed.", flush=True)
-#     pp.dl_and_annotate(retry=True, coeff_ncores=0.5)
-#     pp.build_chains(retry=True, coeff_ncores=1.0)  # Use half the cores to reduce the required amount of memory
-# print(f"> Loaded {len(pp.loaded_chains)} RNA chains ({len(pp.update) - len(pp.loaded_chains)} errors).")
-# pp.checkpoint_save_chains()
-# if not pp.HOMOLOGY:
-#     # Save chains to file
-#     for c in pp.loaded_chains:
-#         work_save(c, homology=False)
-#     print("Completed.")
-#     exit()
+# compute an update compared to what is in the table "chain"
+pp.list_available_mappings()
+# ===========================================================================
+# 3D information
+# ===========================================================================
+# Download and annotate new RNA 3D chains (Chain objects in pp.update)
+pp.dl_and_annotate(coeff_ncores=0.5)
+# At this point, the structure table is up to date
+pp.build_chains(coeff_ncores=2.0)
+if len(pp.to_retry):
+    # Redownload and re-annotate
+    print("> Retrying to annotate some structures which just failed.", flush=True)
+    pp.dl_and_annotate(retry=True, coeff_ncores=0.3)
+    pp.build_chains(retry=True, coeff_ncores=1.0)  # Use half the cores to reduce the required amount of memory
+print(f"> Loaded {len(pp.loaded_chains)} RNA chains ({len(pp.update) - len(pp.loaded_chains)} errors).")
+pp.checkpoint_save_chains()
+if not pp.HOMOLOGY:
+    # Save chains to file
+    for c in pp.loaded_chains:
+        work_save(c, homology=False)
+    print("Completed.")
+    exit()
-# # At this point, structure, chain and nucleotide tables of the database are up to date.
-# # (Modulo some statistics computed by statistics.py)
+# At this point, structure, chain and nucleotide tables of the database are up to date.
+# (Modulo some statistics computed by statistics.py)
# ===========================================================================
# Homology information

statistics.py
@@ -26,11 +26,13 @@ from collections import Counter
from RNAnet import read_cpu_number, sql_ask_database, sql_execute, warn, notify, init_worker
# This sets the paths
path_to_3D_data = "/home/lbecquey/Data/RNA/3D/"
path_to_seq_data = "/home/lbecquey/Data/RNA/sequences/"
if len(sys.argv) > 2:
    path_to_3D_data = path.abspath(sys.argv[1])
    path_to_seq_data = path.abspath(sys.argv[2])
else:
    print("Please set paths to 3D data using command line arguments:")
    print("./statistics.py /path/to/3D/data/ /path/to/sequence/data/")
    exit()
LSU_set = ("RF00002", "RF02540", "RF02541", "RF02543", "RF02546") # From Rfam CLAN 00112
SSU_set = ("RF00177", "RF02542", "RF02545", "RF01959", "RF01960") # From Rfam CLAN 00111
@@ -289,19 +291,32 @@ def parallel_stats_pairs(f):
np.where(expanded_list.nts.isin(["GU","UG"]), "Wobble","Other")
)
)
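# Note: the nested np.where above labels each pair as AU / GC / Wobble (GU,UG)
# or Other according to its two nucleotides; these classes feed the crosstab
# plotted by stats_pairs().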
# checks
# ct = pd.crosstab(expanded_list.pair_type_LW, expanded_list.basepair)
# ct = ct.loc[[ x for x in ["cWW","cHH","cSS","tWW","tHH","tSS"] if x in ct.index ]]
# for _, symmetric_type in ct.iterrows():
#     for x in symmetric_type:
#         if x%2:
#             print("Odd number found for", symmetric_type.name, "in chain", cid, flush=True)
#             print(expanded_list, flush=True)
#             exit()
expanded_list = expanded_list[["basepair", "pair_type_LW"]]
+# Update the database
+vlcnts = expanded_list.pair_type_LW.value_counts()
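+# Why the halving below: expanded_list holds one row per nucleotide, so a
+# symmetric pair type (cWW, cHH, cSS, tWW, tHH, tSS) is reported twice, once
+# by each partner, while an asymmetric pair is reported once as e.g. cWH and
+# once as cHW, so the "cWH" label alone already counts each pair exactly once.
+# Non-LW labels (those containing ".") are summed and halved the same way.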
sqldata = ( vlcnts.at["cWW"]/2 if "cWW" in vlcnts.index else 0,
vlcnts.at["cWH"] if "cWH" in vlcnts.index else 0,
vlcnts.at["cWS"] if "cWS" in vlcnts.index else 0,
vlcnts.at["cHH"]/2 if "cHH" in vlcnts.index else 0,
vlcnts.at["cHS"] if "cHS" in vlcnts.index else 0,
vlcnts.at["cSS"]/2 if "cSS" in vlcnts.index else 0,
vlcnts.at["tWW"]/2 if "tWW" in vlcnts.index else 0,
vlcnts.at["tWH"] if "tWH" in vlcnts.index else 0,
vlcnts.at["tWS"] if "tWS" in vlcnts.index else 0,
vlcnts.at["tHH"]/2 if "tHH" in vlcnts.index else 0,
vlcnts.at["tHS"] if "tHS" in vlcnts.index else 0,
vlcnts.at["tSS"]/2 if "tSS" in vlcnts.index else 0,
int(sum(vlcnts.loc[[ str(x) for x in vlcnts.index if "." in str(x)]])/2),
cid)
+with sqlite3.connect("results/RNANet.db") as conn:
+    sql_execute(conn, """UPDATE chain SET pair_count_cWW = ?, pair_count_cWH = ?, pair_count_cWS = ?, pair_count_cHH = ?,
+                         pair_count_cHS = ?, pair_count_cSS = ?, pair_count_tWW = ?, pair_count_tWH = ?, pair_count_tWS = ?,
+                         pair_count_tHH = ?, pair_count_tHS = ?, pair_count_tSS = ?, pair_count_other = ? WHERE chain_id = ?;""", data=sqldata)
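+# A hypothetical ad-hoc check (not part of the pipeline): the stored counts
+# can be inspected with the sqlite3 CLI, e.g.
+#   sqlite3 results/RNANet.db "SELECT chain_id, pair_count_cWW, pair_count_other FROM chain LIMIT 5;"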
data.append(expanded_list)
# merge all the dataframes from all chains of the family
expanded_list = pd.concat(data)
@@ -336,17 +351,6 @@ def stats_pairs():
fam_pbar.update(1)
results.append(fam_df)
allpairs.append(newpairs)
-# Checks
-vlcnts = newpairs.pair_type_LW.value_counts()
-identical = [fam_df[i][0] == vlcnts.at[i] for i in fam_df.columns]
-if False in identical:
-    print(fam_df)
-    print(vlcnts)
-    print("Dataframes differ for", fam_df.index[0], flush=True)
-for x in ["cWW","cHH","cSS","tWW","tHH","tSS"]:
-    if x in vlcnts.index and vlcnts[x] % 2:
-        print("Found an odd number of", x, "in", fam_df.index[0], flush=True)
fam_pbar.close()
p.close()
p.join()
@@ -359,10 +363,6 @@ def stats_pairs():
all_pairs = pd.concat(allpairs)
df = pd.concat(results).fillna(0)
-vlcnts = all_pairs.pair_type_LW.value_counts()
-for x in ["cWW","cHH","cSS","tWW","tHH","tSS"]:
-    if x in vlcnts.index and vlcnts[x] % 2:
-        print("Found an odd number of", x, "after the merge!", flush=True)
df.to_csv("data/pair_counts.csv")
all_pairs.to_csv("data/all_pairs.csv")
else:
@@ -375,18 +375,16 @@ def stats_pairs():
# Remove not very well defined pair types (not in the 12 LW types)
df['other'] = df[col_list].sum(axis=1)
df.drop(col_list, axis=1, inplace=True)
-crosstab = crosstab.append(crosstab.loc[col_list].sum(axis=0).rename("Other"))
+crosstab = crosstab.append(crosstab.loc[col_list].sum(axis=0).rename("non-LW"))
# drop duplicate types
# The twelve Leontis-Westhof types are
# cWW cWH cWS cHH cHS cSS (do not count cHW cSW and cSH, they are the same as their opposites)
# tWW tWH tWS tHH tHS tSS (do not count tHW tSW and tSH, they are the same as their opposites)
-df.drop([ x for x in [ "cHW", "tHW", "cSW", "tSW", "cHS", "tHS"] if x in df.columns], axis=1)
-crosstab = crosstab.loc[[ x for x in ["cWW","cWH","cWS","cHH","cHS","cSS","tWW","tWH","tWS","tHH","tHS","tSS","Other"] if x in crosstab.index]]
+df = df.drop([ x for x in [ "cHW", "tHW", "cSW", "tSW", "cHS", "tHS"] if x in df.columns], axis=1)
+crosstab = crosstab.loc[[ x for x in ["cWW","cWH","cWS","cHH","cHS","cSS","tWW","tWH","tWS","tHH","tHS","tSS","non-LW"] if x in crosstab.index]]
df.loc[:,[x for x in ["cWW", "tWW", "cHH", "tHH", "cSS", "tSS", "other"] if x in df.columns] ] /= 2
-# crosstab.loc[["cWW", "tWW", "cHH", "tHH", "cSS", "tSS", "Other"]] /= 2
-print(crosstab)
-print(df)
+crosstab.loc[["cWW", "tWW", "cHH", "tHH", "cSS", "tSS", "non-LW"]] /= 2
# Compute total row
total_series = df.sum(numeric_only=True).rename("TOTAL")
@@ -397,15 +395,16 @@ def stats_pairs():
# reorder columns
df.sort_values("TOTAL", axis=1, inplace=True, ascending=False)
crosstab = crosstab[["AU", "GC", "Wobble", "Other"]]
# Save to CSV
df.to_csv("results/pairings.csv")
df.to_csv("results/pair_types.csv")
# Plot barplot of overall types
total_series.sort_values(ascending=False, inplace=True)
ax = total_series.plot(figsize=(5,3), kind='bar', log=True, ylim=(1e4,5000000) )
ax.set_ylabel("Number of observations")
plt.subplots_adjust(bottom=0.2, right=0.99)
+ax = crosstab.plot(figsize=(8,5), kind='bar', stacked=True, log=False, fontsize=13)
+ax.set_ylabel("Number of observations (millions)", fontsize=13)
+ax.set_xlabel(None)
+plt.subplots_adjust(left=0.1, bottom=0.16, top=0.95, right=0.99)
plt.savefig("results/figures/pairings.png")
notify("Computed nucleotide statistics and saved CSV and PNG file.")
@@ -416,7 +415,7 @@ def to_dist_matrix(f):
return 0
dm = DistanceCalculator('identity')
with open(path_to_seq_data+"realigned/"+f+"++.afa") as al_file:
with open(path_to_seq_data+"/realigned/"+f+"++.afa") as al_file:
    al = AlignIO.read(al_file, "fasta")[-len(mappings_list[f]):]
    idty = dm.get_distance(al).matrix  # list of lists
del al
@@ -457,7 +456,7 @@ def seq_idty():
for f, D in zip(famlist, fam_arrays):
    if not len(D): continue
    a = 1.0 - np.average(D + D.T)  # Symmetrize the lower-triangle distance matrix, then convert mean distance to mean identity
-    conn.execute(f"UPDATE family SET idty_percent = {float(a)} WHERE rfam_acc = '{f}';")
+    conn.execute(f"UPDATE family SET idty_percent = {round(float(a),2)} WHERE rfam_acc = '{f}';")
conn.commit()
conn.close()