Separated pydca from RNANet

Khodor HANNOUSH
Commit 8b6b41c82fe3e1a648edb5a1810c0eecd0630a3b 8b6b41c8 1 parent fa394f4a
Showing 1 changed file with 7 additions and 49 deletions
RNAnet.py
--- a/RNAnet.py
View file @8b6b41c
+++ b/RNAnet.py
View file @8b6b41c
@@ -40,7 +40,7 @@ from Bio.SeqIO.FastaIO import FastaIterator, SimpleFastaParser
 from Bio.Seq import MutableSeq
 from Bio.SeqRecord import SeqRecord
 from Bio.Align import MultipleSeqAlignment
- from pydca.plmdca import plmdca
+ 
 runDir = os.getcwd()
 
 def trace_unhandled_exceptions(func):
@@ -1768,10 +1768,6 @@ def sql_define_tables(conn):
                 freq_G          REAL,
                 freq_U          REAL,
                 freq_other      REAL,
-                 fields_A        REAL,
-                 fields_C        REAL,
-                 fields_G        REAL,
-                 fields_U        REAL,
                 gap_percent     REAL,
                 consensus       CHAR(1),
                 cons_sec_struct CHAR(1),
@@ -2445,7 +2441,7 @@ def work_pydca(f, columns_to_save):
     """
     This function writes an alignment file containing only the columns which will be saved to the database,
     converted to uppercase, and without non-ACGU nucleotides.
-     This file in then used by pydca to compute DCA features, and finally removed.
+     This file is then used by pydca to compute DCA features.
     """
     
     align=read(path_to_seq_data + f"realigned/{f}++.afa")
@@ -2469,44 +2465,6 @@ def work_pydca(f, columns_to_save):
         except ValueError as e:
             warn(e)
     
-     # PyDCA instance with options,
-     # Here lamda_J is set by pydca to 0.2*(L-1) where L is the length of the sequences
-     # The maximum number of iterations is set to 500 for gradient descent
-     # Lamda_h is set to 1 and seqid is set to 0.8 as suggested by pydca papers
-     # Reference:
-     # Zerihun MB, Pucci F, Peter EK, Schug A. pydca v1. 0: a comprehensive software for Direct Coupling Analysis of RNA and Protein Sequences. Bioinformatics. 
-     # 2020;36(7):2264–2265. 10.1093/bioinformatics/btz892 - DOI - https://pubmed.ncbi.nlm.nih.gov/31778142/
-     plmdca_inst = plmdca.PlmDCA(path_to_seq_data+f"/realigned/{f}_filtered_for_pydca.afa",  
-                                 "rna", seqid = 0.8, lambda_h = 1.0, num_threads = 10, max_iterations = 500)
-     number_of_sites=len(columns_to_save)*(len(columns_to_save)-1)//2    # L*(L-1)/2 where L=len(columns_to_save)
-     
-     # Tuple of two list of tuples
-     # - the first list contains the fields of sites (nucleotides)
-     # - the second contains pairwise fields (2 nucleotides)
-     # linear distance is zero in order to keep all possible pairs 
-     # because if linear dist=x>0 the pydca will return position |i-j|>x
-     # which will force us to lose a lot of pairs
-     params = plmdca_inst.compute_params(linear_dist=0, num_site_pairs=number_of_sites)
- 
-     # Fröbenius norm with average product correction
-     fn_apc = plmdca_inst.compute_sorted_FN_APC()
- 
-     # Save to file
-     np.savez(path_to_seq_data+f"/realigned/{f}_pydca.npz", PARAMS=params, FNAPC=fn_apc)
- 
-     # A dictionary to be used in the function where the frequencies are stored in align_column table
-     return_dict_fields={}
-     for list_fields in params[0]:
-         # The element at 0 is the index 
-         # So taking the value from column to save at that index will give us 
-         # the fields to be stored at align_column in the table
-         return_dict_fields[columns_to_save[list_fields[0]]] = list_fields[1]
- 
-     # Cleanup
-     subprocess.run(["rm", "-f", path_to_seq_data+f"/realigned/{f}_filtered_for_pydca.afa"])
- 
-     return return_dict_fields
- 
 @trace_unhandled_exceptions
 def work_pssm_remap(f):
     """Computes Position-Specific-Scoring-Matrices given the multiple sequence alignment of the RNA family.
@@ -2719,18 +2677,18 @@ def work_pssm_remap(f):
 
     setproctitle(f"RNAnet.py work_pssm_remap({f}) Potts model, DCA")
 
-     rfam_fields_record = work_pydca(f, columns)
+     work_pydca(f, sorted(columns_to_save))
 
-     data = [(f, j, cm_coords[j-1]) + tuple(pssm_info[:,j-1]) + tuple(rfam_fields_record[j]) + (consensus[j-1], cm_2d[j-1]) for j in sorted(columns_to_save)]
-     sql_execute(conn, """INSERT INTO align_column (rfam_acc, index_ali, cm_coord, freq_A, freq_C, freq_G, freq_U, freq_other, fields_A, fields_C, fields_G, fields_U, gap_percent, consensus, cons_sec_struct)
+     data = [(f, j, cm_coords[j-1]) + tuple(pssm_info[:,j-1]) + (consensus[j-1], cm_2d[j-1]) for j in sorted(columns_to_save)]
+     sql_execute(conn, """INSERT INTO align_column (rfam_acc, index_ali, cm_coord, freq_A, freq_C, freq_G, freq_U, freq_other, gap_percent, consensus, cons_sec_struct)
                          VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) ON CONFLICT(rfam_acc, index_ali) DO 
                          UPDATE SET cm_coord=excluded.cm_coord, freq_A=excluded.freq_A, freq_C=excluded.freq_C, freq_G=excluded.freq_G, freq_U=excluded.freq_U, 
-                                     freq_other=excluded.freq_other, fields_A=excluded.fields_A, fields_C=excluded.fields_C, fields_G=excluded.fields_G, fields_U=excluded.fields_U,
+                                     freq_other=excluded.freq_other,
                                     gap_percent=excluded.gap_percent, consensus=excluded.consensus, cons_sec_struct=excluded.cons_sec_struct;""", many=True, data=data)
     # Add an unknown values column, with index_ali 0 (for nucleotides unsolved in 3D giving a gap '-' but found facing letter in the alignment)
     sql_execute(conn, f"""INSERT OR IGNORE INTO align_column (rfam_acc, index_ali, cm_coord, freq_A, freq_C, freq_G, freq_U, freq_other,
                           fields_A, fields_C, fields_G, fields_U, gap_percent, consensus, cons_sec_struct)
-                           VALUES (?, 0, NULL, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, '-', NULL);""", data=(f,))
+                           VALUES (?, 0, NULL, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, '-', NULL);""", data=(f,))
     
     
     # Save the number of "used columns" to table family ( = the length of the alignment if it was composed only of the RNANet chains)