Separated pydca from RNANet

Khodor HANNOUSH
Commit 8b6b41c82fe3e1a648edb5a1810c0eecd0630a3b 8b6b41c8 1 parent fa394f4a
Showing 1 changed file with 7 additions and 49 deletions
RNAnet.py
--- a/RNAnet.py
View file @8b6b41c
+++ b/RNAnet.py
View file @8b6b41c
@@ -40,7 +40,7 @@ from Bio.SeqIO.FastaIO import FastaIterator, SimpleFastaParser
 from Bio.Seq import MutableSeq
 from Bio.SeqRecord import SeqRecord
 from Bio.Align import MultipleSeqAlignment
-from pydca.plmdca import plmdca
+
 runDir = os.getcwd()
 def trace_unhandled_exceptions(func):
@@ -1768,10 +1768,6 @@ def sql_define_tables(conn):
                 freq_G          REAL,
                 freq_U          REAL,
                 freq_other      REAL,
-                fields_A        REAL,
-                fields_C        REAL,
-                fields_G        REAL,
-                fields_U        REAL,
                 gap_percent     REAL,
                 consensus       CHAR(1),
                 cons_sec_struct CHAR(1),
@@ -2445,7 +2441,7 @@ def work_pydca(f, columns_to_save):
     """
     This function writes an alignment file containing only the columns which will be saved to the database,
     converted to uppercase, and without non-ACGU nucleotides.
-    This file in then used by pydca to compute DCA features, and finally removed.
+    This file in then used by pydca to compute DCA features.
     """
     align=read(path_to_seq_data + f"realigned/{f}++.afa")
@@ -2469,44 +2465,6 @@ def work_pydca(f, columns_to_save):
         except ValueError as e:
             warn(e)
-    # PyDCA instance with options,
-    # Here lamda_J is set by pydca to 0.2*(L-1) where L is the length of the sequences
-    # The maximum number of iterations is set to 500 for gradient descent
-    # Lamda_h is set to 1 and seqid is set to 0.8 as suggested by pydca papers
-    # Reference:
-    # Zerihun MB, Pucci F, Peter EK, Schug A. pydca v1. 0: a comprehensive software for Direct Coupling Analysis of RNA and Protein Sequences. Bioinformatics. 
-    # 2020;36(7):2264–2265. 10.1093/bioinformatics/btz892 - DOI - https://pubmed.ncbi.nlm.nih.gov/31778142/
-    plmdca_inst = plmdca.PlmDCA(path_to_seq_data+f"/realigned/{f}_filtered_for_pydca.afa",  
-                                "rna", seqid = 0.8, lambda_h = 1.0, num_threads = 10, max_iterations = 500)
-    number_of_sites=len(columns_to_save)*(len(columns_to_save)-1)//2    # L*(L-1)/2 where L=len(columns_to_save)
-    
-    # Tuple of two list of tuples
-    # - the first list contains the fields of sites (nucleotides)
-    # - the second contains pairwise fields (2 nucleotides)
-    # linear distance is zero in order to keep all possible pairs 
-    # because if linear dist=x>0 the pydca will return position |i-j|>x
-    # which will force us to lose a lot of pairs
-    params = plmdca_inst.compute_params(linear_dist=0, num_site_pairs=number_of_sites)
-
-    # Fröbenius norm with average product correction
-    fn_apc = plmdca_inst.compute_sorted_FN_APC()
-
-    # Save to file
-    np.savez(path_to_seq_data+f"/realigned/{f}_pydca.npz", PARAMS=params, FNAPC=fn_apc)
-
-    # A dictionary to be used in the function where the frequencies are stored in align_column table
-    return_dict_fields={}
-    for list_fields in params[0]:
-        # The element at 0 is the index 
-        # So taking the value from column to save at that index will give us 
-        # the fields to be stored at align_column in the table
-        return_dict_fields[columns_to_save[list_fields[0]]] = list_fields[1]
-
-    # Cleanup
-    subprocess.run(["rm", "-f", path_to_seq_data+f"/realigned/{f}_filtered_for_pydca.afa"])
-
-    return return_dict_fields
-
 @trace_unhandled_exceptions
 def work_pssm_remap(f):
     """Computes Position-Specific-Scoring-Matrices given the multiple sequence alignment of the RNA family.
@@ -2719,18 +2677,18 @@ def work_pssm_remap(f):
     setproctitle(f"RNAnet.py work_pssm_remap({f}) Potts model, DCA")
-    rfam_fields_record = work_pydca(f, columns)
+    work_pydca(f, sorted(columns_to_save))
-    data = [(f, j, cm_coords[j-1]) + tuple(pssm_info[:,j-1]) + tuple(rfam_fields_record[j]) + (consensus[j-1], cm_2d[j-1]) for j in sorted(columns_to_save)]
+    data = [(f, j, cm_coords[j-1]) + tuple(pssm_info[:,j-1]) + (consensus[j-1], cm_2d[j-1]) for j in sorted(columns_to_save)]
-    sql_execute(conn, """INSERT INTO align_column (rfam_acc, index_ali, cm_coord, freq_A, freq_C, freq_G, freq_U, freq_other, fields_A, fields_C, fields_G, fields_U, gap_percent, consensus, cons_sec_struct)
+    sql_execute(conn, """INSERT INTO align_column (rfam_acc, index_ali, cm_coord, freq_A, freq_C, freq_G, freq_U, freq_other, gap_percent, consensus, cons_sec_struct)
                          VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) ON CONFLICT(rfam_acc, index_ali) DO 
                          UPDATE SET cm_coord=excluded.cm_coord, freq_A=excluded.freq_A, freq_C=excluded.freq_C, freq_G=excluded.freq_G, freq_U=excluded.freq_U, 
-                                    freq_other=excluded.freq_other, fields_A=excluded.fields_A, fields_C=excluded.fields_C, fields_G=excluded.fields_G, fields_U=excluded.fields_U,
+                                    freq_other=excluded.freq_other,
                                     gap_percent=excluded.gap_percent, consensus=excluded.consensus, cons_sec_struct=excluded.cons_sec_struct;""", many=True, data=data)
     # Add an unknown values column, with index_ali 0 (for nucleotides unsolved in 3D giving a gap '-' but found facing letter in the alignment)
     sql_execute(conn, f"""INSERT OR IGNORE INTO align_column (rfam_acc, index_ali, cm_coord, freq_A, freq_C, freq_G, freq_U, freq_other,
                           fields_A, fields_C, fields_G, fields_U, gap_percent, consensus, cons_sec_struct)
-                          VALUES (?, 0, NULL, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, '-', NULL);""", data=(f,))
+                          VALUES (?, 0, NULL, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, '-', NULL);""", data=(f,))
     # Save the number of "used columns" to table family ( = the length of the alignment if it was composed only of the RNANet chains)