Showing
1 changed file
with
7 additions
and
49 deletions
... | @@ -40,7 +40,7 @@ from Bio.SeqIO.FastaIO import FastaIterator, SimpleFastaParser | ... | @@ -40,7 +40,7 @@ from Bio.SeqIO.FastaIO import FastaIterator, SimpleFastaParser |
40 | from Bio.Seq import MutableSeq | 40 | from Bio.Seq import MutableSeq |
41 | from Bio.SeqRecord import SeqRecord | 41 | from Bio.SeqRecord import SeqRecord |
42 | from Bio.Align import MultipleSeqAlignment | 42 | from Bio.Align import MultipleSeqAlignment |
43 | -from pydca.plmdca import plmdca | 43 | + |
44 | runDir = os.getcwd() | 44 | runDir = os.getcwd() |
45 | 45 | ||
46 | def trace_unhandled_exceptions(func): | 46 | def trace_unhandled_exceptions(func): |
... | @@ -1768,10 +1768,6 @@ def sql_define_tables(conn): | ... | @@ -1768,10 +1768,6 @@ def sql_define_tables(conn): |
1768 | freq_G REAL, | 1768 | freq_G REAL, |
1769 | freq_U REAL, | 1769 | freq_U REAL, |
1770 | freq_other REAL, | 1770 | freq_other REAL, |
1771 | - fields_A REAL, | ||
1772 | - fields_C REAL, | ||
1773 | - fields_G REAL, | ||
1774 | - fields_U REAL, | ||
1775 | gap_percent REAL, | 1771 | gap_percent REAL, |
1776 | consensus CHAR(1), | 1772 | consensus CHAR(1), |
1777 | cons_sec_struct CHAR(1), | 1773 | cons_sec_struct CHAR(1), |
... | @@ -2445,7 +2441,7 @@ def work_pydca(f, columns_to_save): | ... | @@ -2445,7 +2441,7 @@ def work_pydca(f, columns_to_save): |
2445 | """ | 2441 | """ |
2446 | This function writes an alignment file containing only the columns which will be saved to the database, | 2442 | This function writes an alignment file containing only the columns which will be saved to the database, |
2447 | converted to uppercase, and without non-ACGU nucleotides. | 2443 | converted to uppercase, and without non-ACGU nucleotides. |
2448 | - This file in then used by pydca to compute DCA features, and finally removed. | 2444 | + This file in then used by pydca to compute DCA features. |
2449 | """ | 2445 | """ |
2450 | 2446 | ||
2451 | align=read(path_to_seq_data + f"realigned/{f}++.afa") | 2447 | align=read(path_to_seq_data + f"realigned/{f}++.afa") |
... | @@ -2469,44 +2465,6 @@ def work_pydca(f, columns_to_save): | ... | @@ -2469,44 +2465,6 @@ def work_pydca(f, columns_to_save): |
2469 | except ValueError as e: | 2465 | except ValueError as e: |
2470 | warn(e) | 2466 | warn(e) |
2471 | 2467 | ||
2472 | - # PyDCA instance with options, | ||
2473 | - # Here lamda_J is set by pydca to 0.2*(L-1) where L is the length of the sequences | ||
2474 | - # The maximum number of iterations is set to 500 for gradient descent | ||
2475 | - # Lamda_h is set to 1 and seqid is set to 0.8 as suggested by pydca papers | ||
2476 | - # Reference: | ||
2477 | - # Zerihun MB, Pucci F, Peter EK, Schug A. pydca v1. 0: a comprehensive software for Direct Coupling Analysis of RNA and Protein Sequences. Bioinformatics. | ||
2478 | - # 2020;36(7):2264–2265. 10.1093/bioinformatics/btz892 - DOI - https://pubmed.ncbi.nlm.nih.gov/31778142/ | ||
2479 | - plmdca_inst = plmdca.PlmDCA(path_to_seq_data+f"/realigned/{f}_filtered_for_pydca.afa", | ||
2480 | - "rna", seqid = 0.8, lambda_h = 1.0, num_threads = 10, max_iterations = 500) | ||
2481 | - number_of_sites=len(columns_to_save)*(len(columns_to_save)-1)//2 # L*(L-1)/2 where L=len(columns_to_save) | ||
2482 | - | ||
2483 | - # Tuple of two list of tuples | ||
2484 | - # - the first list contains the fields of sites (nucleotides) | ||
2485 | - # - the second contains pairwise fields (2 nucleotides) | ||
2486 | - # linear distance is zero in order to keep all possible pairs | ||
2487 | - # because if linear dist=x>0 the pydca will return position |i-j|>x | ||
2488 | - # which will force us to lose a lot of pairs | ||
2489 | - params = plmdca_inst.compute_params(linear_dist=0, num_site_pairs=number_of_sites) | ||
2490 | - | ||
2491 | - # Fröbenius norm with average product correction | ||
2492 | - fn_apc = plmdca_inst.compute_sorted_FN_APC() | ||
2493 | - | ||
2494 | - # Save to file | ||
2495 | - np.savez(path_to_seq_data+f"/realigned/{f}_pydca.npz", PARAMS=params, FNAPC=fn_apc) | ||
2496 | - | ||
2497 | - # A dictionary to be used in the function where the frequencies are stored in align_column table | ||
2498 | - return_dict_fields={} | ||
2499 | - for list_fields in params[0]: | ||
2500 | - # The element at 0 is the index | ||
2501 | - # So taking the value from column to save at that index will give us | ||
2502 | - # the fields to be stored at align_column in the table | ||
2503 | - return_dict_fields[columns_to_save[list_fields[0]]] = list_fields[1] | ||
2504 | - | ||
2505 | - # Cleanup | ||
2506 | - subprocess.run(["rm", "-f", path_to_seq_data+f"/realigned/{f}_filtered_for_pydca.afa"]) | ||
2507 | - | ||
2508 | - return return_dict_fields | ||
2509 | - | ||
2510 | @trace_unhandled_exceptions | 2468 | @trace_unhandled_exceptions |
2511 | def work_pssm_remap(f): | 2469 | def work_pssm_remap(f): |
2512 | """Computes Position-Specific-Scoring-Matrices given the multiple sequence alignment of the RNA family. | 2470 | """Computes Position-Specific-Scoring-Matrices given the multiple sequence alignment of the RNA family. |
... | @@ -2719,18 +2677,18 @@ def work_pssm_remap(f): | ... | @@ -2719,18 +2677,18 @@ def work_pssm_remap(f): |
2719 | 2677 | ||
2720 | setproctitle(f"RNAnet.py work_pssm_remap({f}) Potts model, DCA") | 2678 | setproctitle(f"RNAnet.py work_pssm_remap({f}) Potts model, DCA") |
2721 | 2679 | ||
2722 | - rfam_fields_record = work_pydca(f, columns) | 2680 | + work_pydca(f, sorted(columns_to_save)) |
2723 | 2681 | ||
2724 | - data = [(f, j, cm_coords[j-1]) + tuple(pssm_info[:,j-1]) + tuple(rfam_fields_record[j]) + (consensus[j-1], cm_2d[j-1]) for j in sorted(columns_to_save)] | 2682 | + data = [(f, j, cm_coords[j-1]) + tuple(pssm_info[:,j-1]) + (consensus[j-1], cm_2d[j-1]) for j in sorted(columns_to_save)] |
2725 | - sql_execute(conn, """INSERT INTO align_column (rfam_acc, index_ali, cm_coord, freq_A, freq_C, freq_G, freq_U, freq_other, fields_A, fields_C, fields_G, fields_U, gap_percent, consensus, cons_sec_struct) | 2683 | + sql_execute(conn, """INSERT INTO align_column (rfam_acc, index_ali, cm_coord, freq_A, freq_C, freq_G, freq_U, freq_other, gap_percent, consensus, cons_sec_struct) |
2726 | VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) ON CONFLICT(rfam_acc, index_ali) DO | 2684 | VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) ON CONFLICT(rfam_acc, index_ali) DO |
2727 | UPDATE SET cm_coord=excluded.cm_coord, freq_A=excluded.freq_A, freq_C=excluded.freq_C, freq_G=excluded.freq_G, freq_U=excluded.freq_U, | 2685 | UPDATE SET cm_coord=excluded.cm_coord, freq_A=excluded.freq_A, freq_C=excluded.freq_C, freq_G=excluded.freq_G, freq_U=excluded.freq_U, |
2728 | - freq_other=excluded.freq_other, fields_A=excluded.fields_A, fields_C=excluded.fields_C, fields_G=excluded.fields_G, fields_U=excluded.fields_U, | 2686 | + freq_other=excluded.freq_other, |
2729 | gap_percent=excluded.gap_percent, consensus=excluded.consensus, cons_sec_struct=excluded.cons_sec_struct;""", many=True, data=data) | 2687 | gap_percent=excluded.gap_percent, consensus=excluded.consensus, cons_sec_struct=excluded.cons_sec_struct;""", many=True, data=data) |
2730 | # Add an unknown values column, with index_ali 0 (for nucleotides unsolved in 3D giving a gap '-' but found facing letter in the alignment) | 2688 | # Add an unknown values column, with index_ali 0 (for nucleotides unsolved in 3D giving a gap '-' but found facing letter in the alignment) |
2731 | sql_execute(conn, f"""INSERT OR IGNORE INTO align_column (rfam_acc, index_ali, cm_coord, freq_A, freq_C, freq_G, freq_U, freq_other, | 2689 | sql_execute(conn, f"""INSERT OR IGNORE INTO align_column (rfam_acc, index_ali, cm_coord, freq_A, freq_C, freq_G, freq_U, freq_other, |
2732 | fields_A, fields_C, fields_G, fields_U, gap_percent, consensus, cons_sec_struct) | 2690 | fields_A, fields_C, fields_G, fields_U, gap_percent, consensus, cons_sec_struct) |
2733 | - VALUES (?, 0, NULL, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, '-', NULL);""", data=(f,)) | 2691 | + VALUES (?, 0, NULL, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, '-', NULL);""", data=(f,)) |
2734 | 2692 | ||
2735 | 2693 | ||
2736 | # Save the number of "used columns" to table family ( = the length of the alignment if it was composed only of the RNANet chains) | 2694 | # Save the number of "used columns" to table family ( = the length of the alignment if it was composed only of the RNANet chains) | ... | ... |
-
Please register or login to post a comment