Showing 2 changed files with 102 additions and 26 deletions
This diff is collapsed. Click to expand it.
... | @@ -175,7 +175,7 @@ def stats_len(): | ... | @@ -175,7 +175,7 @@ def stats_len(): |
175 | cols.append("orange") | 175 | cols.append("orange") |
176 | else: | 176 | else: |
177 | cols.append("grey") | 177 | cols.append("grey") |
178 | - l = [ x[0] for x in sql_ask_database(conn, f"SELECT COUNT(nt_id) FROM (SELECT chain_id FROM chain WHERE rfam_acc='{f}') NATURAL JOIN nucleotide GROUP BY chain_id;") ] | 178 | + l = [ x[0] for x in sql_ask_database(conn, f"SELECT COUNT(index_chain) FROM (SELECT chain_id FROM chain WHERE rfam_acc='{f}') NATURAL JOIN nucleotide GROUP BY chain_id;") ] |
179 | lengths.append(l) | 179 | lengths.append(l) |
180 | notify(f"[{i+1}/{len(fam_list)}] Computed {f} chains lengths") | 180 | notify(f"[{i+1}/{len(fam_list)}] Computed {f} chains lengths") |
181 | conn.close() | 181 | conn.close() |
... | @@ -245,32 +245,83 @@ def parallel_stats_pairs(f): | ... | @@ -245,32 +245,83 @@ def parallel_stats_pairs(f): |
245 | 245 | ||
246 | REQUIRES tables chain, nucleotide up-to-date.""" | 246 | REQUIRES tables chain, nucleotide up-to-date.""" |
247 | 247 | ||
248 | - with sqlite3.connect("results/RNANet.db") as conn: | 248 | + chain_id_list = mappings_list[f] |
249 | - # Get comma separated lists of basepairs per nucleotide | 249 | + data = [] |
250 | - interactions = pd.read_sql(f"SELECT paired, pair_type_LW FROM (SELECT chain_id FROM chain WHERE rfam_acc='{f}') NATURAL JOIN nucleotide WHERE nb_interact>0;", conn) | 250 | + for cid in chain_id_list: |
251 | - | 251 | + with sqlite3.connect("results/RNANet.db") as conn: |
252 | - # expand the comma-separated lists in real lists | 252 | + # Get comma separated lists of basepairs per nucleotide |
253 | - expanded_list = pd.concat([ pd.DataFrame({ 'paired':row['paired'].split(','), 'pair_type_LW':row['pair_type_LW'].split(',') }) | 253 | + interactions = pd.read_sql(f"SELECT nt_code as nt1, index_chain, paired, pair_type_LW FROM (SELECT chain_id FROM chain WHERE chain_id='{cid}') NATURAL JOIN nucleotide;", conn) |
254 | - for _, row in interactions.iterrows() ]).reset_index(drop=True) | 254 | + |
255 | - # keep only intra-chain interactions | 255 | + # expand the comma-separated lists in real lists |
256 | - expanded_list = expanded_list[ expanded_list.paired != '0' ].pair_type_LW | 256 | + expanded_list = pd.concat([ pd.DataFrame({ 'nt1':[ row["nt1"] for x in row["paired"].split(',') ], |
257 | + 'index_chain':[ row['index_chain'] for x in row["paired"].split(',') ], | ||
258 | + 'paired':row['paired'].split(','), | ||
259 | + 'pair_type_LW':row['pair_type_LW'].split(',') | ||
260 | + }) | ||
261 | + for _, row in interactions.iterrows() | ||
262 | + ]).reset_index(drop=True) | ||
263 | + | ||
264 | + # Add second nucleotide | ||
265 | + nt2 = [] | ||
266 | + for _, row in expanded_list.iterrows(): | ||
267 | + if row.paired in ['', '0']: | ||
268 | + nt2.append('') | ||
269 | + else: | ||
270 | + try: | ||
271 | + n = expanded_list[expanded_list.index_chain == int(row.paired)].nt1.tolist()[0] | ||
272 | + nt2.append(n) | ||
273 | + except IndexError: | ||
274 | + print(cid, flush=True) | ||
275 | + try: | ||
276 | + expanded_list["nt2"] = nt2 | ||
277 | + except ValueError: | ||
278 | + print(cid, flush=True) | ||
279 | + print(expanded_list, flush=True) | ||
280 | + return 0,0 | ||
281 | + | ||
282 | + # keep only intra-chain interactions | ||
283 | + expanded_list = expanded_list[ ~expanded_list.paired.isin(['0','']) ] | ||
284 | + expanded_list["nts"] = expanded_list["nt1"] + expanded_list["nt2"] | ||
285 | + | ||
286 | + # Get basepair type | ||
287 | + expanded_list["basepair"] = np.where(expanded_list.nts.isin(["AU","UA"]), "AU", | ||
288 | + np.where(expanded_list.nts.isin(["GC","CG"]), "GC", | ||
289 | + np.where(expanded_list.nts.isin(["GU","UG"]), "Wobble","Other") | ||
290 | + ) | ||
291 | + ) | ||
292 | + # checks | ||
293 | + # ct = pd.crosstab(expanded_list.pair_type_LW, expanded_list.basepair) | ||
294 | + # ct = ct.loc[[ x for x in ["cWW","cHH","cSS","tWW","tHH","tSS"] if x in ct.index ]] | ||
295 | + # for _, symmetric_type in ct.iterrows(): | ||
296 | + # for x in symmetric_type: | ||
297 | + # if x%2: | ||
298 | + # print("Odd number found for", symmetric_type.name, "in chain", cid, flush=True) | ||
299 | + # print(expanded_list, flush=True) | ||
300 | + # exit() | ||
301 | + | ||
302 | + expanded_list = expanded_list[["basepair", "pair_type_LW"]] | ||
303 | + data.append(expanded_list) | ||
304 | + | ||
305 | + # merge all the dataframes from all chains of the family | ||
306 | + expanded_list = pd.concat(data) | ||
257 | 307 | ||
258 | # Count each pair type | 308 | # Count each pair type |
259 | - vcnts = expanded_list.value_counts() | 309 | + vcnts = expanded_list.pair_type_LW.value_counts() |
260 | 310 | ||
261 | # Add these new counts to the family's counter | 311 | # Add these new counts to the family's counter |
262 | cnt = Counter() | 312 | cnt = Counter() |
263 | cnt.update(dict(vcnts)) | 313 | cnt.update(dict(vcnts)) |
264 | 314 | ||
265 | # Create an output DataFrame | 315 | # Create an output DataFrame |
266 | - return pd.DataFrame([[ x for x in cnt.values() ]], columns=list(cnt), index=[f]) | 316 | + f_df = pd.DataFrame([[ x for x in cnt.values() ]], columns=list(cnt), index=[f]) |
317 | + return expanded_list, f_df | ||
267 | 318 | ||
268 | def stats_pairs(): | 319 | def stats_pairs(): |
269 | """Counts occurrences of intra-chain base-pair types in RNA families | 320 | """Counts occurrences of intra-chain base-pair types in RNA families |
270 | 321 | ||
271 | Creates a temporary results file in data/pair_counts.csv, and a results file in results/pairings.csv. | 322 | Creates a temporary results file in data/pair_counts.csv, and a results file in results/pairings.csv. |
272 | REQUIRES tables chain, nucleotide up-to-date.""" | 323 | REQUIRES tables chain, nucleotide up-to-date.""" |
273 | - | 324 | + |
274 | def line_format(family_data): | 325 | def line_format(family_data): |
275 | return family_data.apply(partial(format_percentage, sum(family_data))) | 326 | return family_data.apply(partial(format_percentage, sum(family_data))) |
276 | 327 | ||
... | @@ -279,9 +330,23 @@ def stats_pairs(): | ... | @@ -279,9 +330,23 @@ def stats_pairs(): |
279 | try: | 330 | try: |
280 | fam_pbar = tqdm(total=len(fam_list), desc="Pair-types in families", position=0, leave=True) | 331 | fam_pbar = tqdm(total=len(fam_list), desc="Pair-types in families", position=0, leave=True) |
281 | results = [] | 332 | results = [] |
282 | - for i, fam_df in enumerate(p.imap_unordered(parallel_stats_pairs, fam_list)): | 333 | + allpairs = [] |
334 | + for i, _ in enumerate(p.imap_unordered(parallel_stats_pairs, fam_list)): | ||
335 | + newpairs, fam_df = _ | ||
283 | fam_pbar.update(1) | 336 | fam_pbar.update(1) |
284 | results.append(fam_df) | 337 | results.append(fam_df) |
338 | + allpairs.append(newpairs) | ||
339 | + | ||
340 | + # Checks | ||
341 | + vlcnts= newpairs.pair_type_LW.value_counts() | ||
342 | + identical = [fam_df[i][0] == newpairs.pair_type_LW.value_counts().at[i] for i in fam_df.columns] | ||
343 | + if False in identical: | ||
344 | + print(fam_df) | ||
345 | + print(vlcnts) | ||
346 | + print("Dataframes differ for",fam_df.index[0], flush=True) | ||
347 | + for x in ["cWW","cHH","cSS","tWW","tHH","tSS"]: | ||
348 | + if x in vlcnts.index and vlcnts[x] % 2: | ||
349 | + print("Trouvé un nombre impair de",x,"dans",fam_df.index[0], flush=True) | ||
285 | fam_pbar.close() | 350 | fam_pbar.close() |
286 | p.close() | 351 | p.close() |
287 | p.join() | 352 | p.join() |
... | @@ -292,24 +357,36 @@ def stats_pairs(): | ... | @@ -292,24 +357,36 @@ def stats_pairs(): |
292 | p.join() | 357 | p.join() |
293 | exit(1) | 358 | exit(1) |
294 | 359 | ||
360 | + all_pairs = pd.concat(allpairs) | ||
295 | df = pd.concat(results).fillna(0) | 361 | df = pd.concat(results).fillna(0) |
362 | + vlcnts= all_pairs.pair_type_LW.value_counts() | ||
363 | + for x in ["cWW","cHH","cSS","tWW","tHH","tSS"]: | ||
364 | + if x in vlcnts.index and vlcnts[x] % 2: | ||
365 | + print("Trouvé un nombre impair de",x,"après le merge !", flush=True) | ||
296 | df.to_csv("data/pair_counts.csv") | 366 | df.to_csv("data/pair_counts.csv") |
367 | + all_pairs.to_csv("data/all_pairs.csv") | ||
297 | else: | 368 | else: |
298 | df = pd.read_csv("data/pair_counts.csv", index_col=0) | 369 | df = pd.read_csv("data/pair_counts.csv", index_col=0) |
370 | + all_pairs = pd.read_csv("data/all_pairs.csv", index_col=0) | ||
299 | 371 | ||
300 | - print(df) | 372 | + crosstab = pd.crosstab(all_pairs.pair_type_LW, all_pairs.basepair) |
301 | - # Remove not very well defined pair types (not in the 12 LW types) | ||
302 | col_list = [ x for x in df.columns if '.' in x ] | 373 | col_list = [ x for x in df.columns if '.' in x ] |
374 | + | ||
375 | + # Remove not very well defined pair types (not in the 12 LW types) | ||
303 | df['other'] = df[col_list].sum(axis=1) | 376 | df['other'] = df[col_list].sum(axis=1) |
304 | df.drop(col_list, axis=1, inplace=True) | 377 | df.drop(col_list, axis=1, inplace=True) |
305 | - print(df) | 378 | + crosstab = crosstab.append(crosstab.loc[col_list].sum(axis=0).rename("Other")) |
306 | - | 379 | + |
307 | # drop duplicate types | 380 | # drop duplicate types |
308 | # The twelve Leontis-Westhof types are | 381 | # The twelve Leontis-Westhof types are |
309 | # cWW cWH cWS cHH cHS cSS (do not count cHW cSW and cSH, they are the same as their opposites) | 382 | # cWW cWH cWS cHH cHS cSS (do not count cHW cSW and cSH, they are the same as their opposites) |
310 | # tWW tWH tWS tHH tHS tSS (do not count tHW tSW and tSH, they are the same as their opposites) | 383 | # tWW tWH tWS tHH tHS tSS (do not count tHW tSW and tSH, they are the same as their opposites) |
311 | - df.drop([ "cHW", "tHW", "cSW", "tSW", "cHS", "tHS"], axis=1) | 384 | + df.drop([ x for x in [ "cHW", "tHW", "cSW", "tSW", "cHS", "tHS"] if x in df.columns], axis=1) |
312 | - df.loc[ ["cWW", "tWW", "cHH", "tHH", "cSS", "tSS", "other"] ] /= 2.0 | 385 | + crosstab = crosstab.loc[[ x for x in ["cWW","cWH","cWS","cHH","cHS","cSS","tWW","tWH","tWS","tHH","tHS","tSS","Other"] if x in crosstab.index]] |
386 | + df.loc[:,[x for x in ["cWW", "tWW", "cHH", "tHH", "cSS", "tSS", "other"] if x in df.columns] ] /= 2 | ||
387 | + # crosstab.loc[["cWW", "tWW", "cHH", "tHH", "cSS", "tSS", "Other"]] /= 2 | ||
388 | + print(crosstab) | ||
389 | + print(df) | ||
313 | 390 | ||
314 | # Compute total row | 391 | # Compute total row |
315 | total_series = df.sum(numeric_only=True).rename("TOTAL") | 392 | total_series = df.sum(numeric_only=True).rename("TOTAL") |
... | @@ -326,7 +403,6 @@ def stats_pairs(): | ... | @@ -326,7 +403,6 @@ def stats_pairs(): |
326 | 403 | ||
327 | # Plot barplot of overall types | 404 | # Plot barplot of overall types |
328 | total_series.sort_values(ascending=False, inplace=True) | 405 | total_series.sort_values(ascending=False, inplace=True) |
329 | - total_series.apply(lambda x: x/2.0) # each interaction was counted twice because one time per extremity | ||
330 | ax = total_series.plot(figsize=(5,3), kind='bar', log=True, ylim=(1e4,5000000) ) | 406 | ax = total_series.plot(figsize=(5,3), kind='bar', log=True, ylim=(1e4,5000000) ) |
331 | ax.set_ylabel("Number of observations") | 407 | ax.set_ylabel("Number of observations") |
332 | plt.subplots_adjust(bottom=0.2, right=0.99) | 408 | plt.subplots_adjust(bottom=0.2, right=0.99) |
... | @@ -445,11 +521,11 @@ if __name__ == "__main__": | ... | @@ -445,11 +521,11 @@ if __name__ == "__main__": |
445 | 521 | ||
446 | # Define threads for the tasks | 522 | # Define threads for the tasks |
447 | threads = [ | 523 | threads = [ |
448 | - # th.Thread(target=reproduce_wadley_results, kwargs={'carbon': 1}), | 524 | + th.Thread(target=reproduce_wadley_results, kwargs={'carbon': 1}), |
449 | - # th.Thread(target=reproduce_wadley_results, kwargs={'carbon': 4}), | 525 | + th.Thread(target=reproduce_wadley_results, kwargs={'carbon': 4}), |
450 | - # th.Thread(target=stats_len), | 526 | + th.Thread(target=stats_len), |
451 | - # th.Thread(target=stats_freq), | 527 | + th.Thread(target=stats_freq), |
452 | - # th.Thread(target=seq_idty), | 528 | + th.Thread(target=seq_idty), |
453 | th.Thread(target=per_chain_stats) | 529 | th.Thread(target=per_chain_stats) |
454 | ] | 530 | ] |
455 | 531 | ... | ... |
-
Please register or login to post a comment