Khodor HANNOUSH

Separated pydca from RNANet

Showing 1 changed file with 7 additions and 49 deletions
...@@ -40,7 +40,7 @@ from Bio.SeqIO.FastaIO import FastaIterator, SimpleFastaParser ...@@ -40,7 +40,7 @@ from Bio.SeqIO.FastaIO import FastaIterator, SimpleFastaParser
40 from Bio.Seq import MutableSeq 40 from Bio.Seq import MutableSeq
41 from Bio.SeqRecord import SeqRecord 41 from Bio.SeqRecord import SeqRecord
42 from Bio.Align import MultipleSeqAlignment 42 from Bio.Align import MultipleSeqAlignment
43 -from pydca.plmdca import plmdca 43 +
44 runDir = os.getcwd() 44 runDir = os.getcwd()
45 45
46 def trace_unhandled_exceptions(func): 46 def trace_unhandled_exceptions(func):
...@@ -1768,10 +1768,6 @@ def sql_define_tables(conn): ...@@ -1768,10 +1768,6 @@ def sql_define_tables(conn):
1768 freq_G REAL, 1768 freq_G REAL,
1769 freq_U REAL, 1769 freq_U REAL,
1770 freq_other REAL, 1770 freq_other REAL,
1771 - fields_A REAL,
1772 - fields_C REAL,
1773 - fields_G REAL,
1774 - fields_U REAL,
1775 gap_percent REAL, 1771 gap_percent REAL,
1776 consensus CHAR(1), 1772 consensus CHAR(1),
1777 cons_sec_struct CHAR(1), 1773 cons_sec_struct CHAR(1),
...@@ -2445,7 +2441,7 @@ def work_pydca(f, columns_to_save): ...@@ -2445,7 +2441,7 @@ def work_pydca(f, columns_to_save):
2445 """ 2441 """
2446 This function writes an alignment file containing only the columns which will be saved to the database, 2442 This function writes an alignment file containing only the columns which will be saved to the database,
2447 converted to uppercase, and without non-ACGU nucleotides. 2443 converted to uppercase, and without non-ACGU nucleotides.
2448 - This file in then used by pydca to compute DCA features, and finally removed. 2444 + This file in then used by pydca to compute DCA features.
2449 """ 2445 """
2450 2446
2451 align=read(path_to_seq_data + f"realigned/{f}++.afa") 2447 align=read(path_to_seq_data + f"realigned/{f}++.afa")
...@@ -2469,44 +2465,6 @@ def work_pydca(f, columns_to_save): ...@@ -2469,44 +2465,6 @@ def work_pydca(f, columns_to_save):
2469 except ValueError as e: 2465 except ValueError as e:
2470 warn(e) 2466 warn(e)
2471 2467
2472 - # PyDCA instance with options,
2473 - # Here lamda_J is set by pydca to 0.2*(L-1) where L is the length of the sequences
2474 - # The maximum number of iterations is set to 500 for gradient descent
2475 - # Lamda_h is set to 1 and seqid is set to 0.8 as suggested by pydca papers
2476 - # Reference:
2477 - # Zerihun MB, Pucci F, Peter EK, Schug A. pydca v1. 0: a comprehensive software for Direct Coupling Analysis of RNA and Protein Sequences. Bioinformatics.
2478 - # 2020;36(7):2264–2265. 10.1093/bioinformatics/btz892 - DOI - https://pubmed.ncbi.nlm.nih.gov/31778142/
2479 - plmdca_inst = plmdca.PlmDCA(path_to_seq_data+f"/realigned/{f}_filtered_for_pydca.afa",
2480 - "rna", seqid = 0.8, lambda_h = 1.0, num_threads = 10, max_iterations = 500)
2481 - number_of_sites=len(columns_to_save)*(len(columns_to_save)-1)//2 # L*(L-1)/2 where L=len(columns_to_save)
2482 -
2483 - # Tuple of two list of tuples
2484 - # - the first list contains the fields of sites (nucleotides)
2485 - # - the second contains pairwise fields (2 nucleotides)
2486 - # linear distance is zero in order to keep all possible pairs
2487 - # because if linear dist=x>0 the pydca will return position |i-j|>x
2488 - # which will force us to lose a lot of pairs
2489 - params = plmdca_inst.compute_params(linear_dist=0, num_site_pairs=number_of_sites)
2490 -
2491 - # Fröbenius norm with average product correction
2492 - fn_apc = plmdca_inst.compute_sorted_FN_APC()
2493 -
2494 - # Save to file
2495 - np.savez(path_to_seq_data+f"/realigned/{f}_pydca.npz", PARAMS=params, FNAPC=fn_apc)
2496 -
2497 - # A dictionary to be used in the function where the frequencies are stored in align_column table
2498 - return_dict_fields={}
2499 - for list_fields in params[0]:
2500 - # The element at 0 is the index
2501 - # So taking the value from column to save at that index will give us
2502 - # the fields to be stored at align_column in the table
2503 - return_dict_fields[columns_to_save[list_fields[0]]] = list_fields[1]
2504 -
2505 - # Cleanup
2506 - subprocess.run(["rm", "-f", path_to_seq_data+f"/realigned/{f}_filtered_for_pydca.afa"])
2507 -
2508 - return return_dict_fields
2509 -
2510 @trace_unhandled_exceptions 2468 @trace_unhandled_exceptions
2511 def work_pssm_remap(f): 2469 def work_pssm_remap(f):
2512 """Computes Position-Specific-Scoring-Matrices given the multiple sequence alignment of the RNA family. 2470 """Computes Position-Specific-Scoring-Matrices given the multiple sequence alignment of the RNA family.
...@@ -2719,18 +2677,18 @@ def work_pssm_remap(f): ...@@ -2719,18 +2677,18 @@ def work_pssm_remap(f):
2719 2677
2720 setproctitle(f"RNAnet.py work_pssm_remap({f}) Potts model, DCA") 2678 setproctitle(f"RNAnet.py work_pssm_remap({f}) Potts model, DCA")
2721 2679
2722 - rfam_fields_record = work_pydca(f, columns) 2680 + work_pydca(f, sorted(columns_to_save))
2723 2681
2724 - data = [(f, j, cm_coords[j-1]) + tuple(pssm_info[:,j-1]) + tuple(rfam_fields_record[j]) + (consensus[j-1], cm_2d[j-1]) for j in sorted(columns_to_save)] 2682 + data = [(f, j, cm_coords[j-1]) + tuple(pssm_info[:,j-1]) + (consensus[j-1], cm_2d[j-1]) for j in sorted(columns_to_save)]
2725 - sql_execute(conn, """INSERT INTO align_column (rfam_acc, index_ali, cm_coord, freq_A, freq_C, freq_G, freq_U, freq_other, fields_A, fields_C, fields_G, fields_U, gap_percent, consensus, cons_sec_struct) 2683 + sql_execute(conn, """INSERT INTO align_column (rfam_acc, index_ali, cm_coord, freq_A, freq_C, freq_G, freq_U, freq_other, gap_percent, consensus, cons_sec_struct)
2726 VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) ON CONFLICT(rfam_acc, index_ali) DO 2684 VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) ON CONFLICT(rfam_acc, index_ali) DO
2727 UPDATE SET cm_coord=excluded.cm_coord, freq_A=excluded.freq_A, freq_C=excluded.freq_C, freq_G=excluded.freq_G, freq_U=excluded.freq_U, 2685 UPDATE SET cm_coord=excluded.cm_coord, freq_A=excluded.freq_A, freq_C=excluded.freq_C, freq_G=excluded.freq_G, freq_U=excluded.freq_U,
2728 - freq_other=excluded.freq_other, fields_A=excluded.fields_A, fields_C=excluded.fields_C, fields_G=excluded.fields_G, fields_U=excluded.fields_U, 2686 + freq_other=excluded.freq_other,
2729 gap_percent=excluded.gap_percent, consensus=excluded.consensus, cons_sec_struct=excluded.cons_sec_struct;""", many=True, data=data) 2687 gap_percent=excluded.gap_percent, consensus=excluded.consensus, cons_sec_struct=excluded.cons_sec_struct;""", many=True, data=data)
2730 # Add an unknown values column, with index_ali 0 (for nucleotides unsolved in 3D giving a gap '-' but found facing letter in the alignment) 2688 # Add an unknown values column, with index_ali 0 (for nucleotides unsolved in 3D giving a gap '-' but found facing letter in the alignment)
2731 sql_execute(conn, f"""INSERT OR IGNORE INTO align_column (rfam_acc, index_ali, cm_coord, freq_A, freq_C, freq_G, freq_U, freq_other, 2689 sql_execute(conn, f"""INSERT OR IGNORE INTO align_column (rfam_acc, index_ali, cm_coord, freq_A, freq_C, freq_G, freq_U, freq_other,
2732 fields_A, fields_C, fields_G, fields_U, gap_percent, consensus, cons_sec_struct) 2690 fields_A, fields_C, fields_G, fields_U, gap_percent, consensus, cons_sec_struct)
2733 - VALUES (?, 0, NULL, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, '-', NULL);""", data=(f,)) 2691 + VALUES (?, 0, NULL, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, '-', NULL);""", data=(f,))
2734 2692
2735 2693
2736 # Save the number of "used columns" to table family ( = the length of the alignment if it was composed only of the RNANet chains) 2694 # Save the number of "used columns" to table family ( = the length of the alignment if it was composed only of the RNANet chains)
......