Khodor HANNOUSH

Moved to version 1.4 beta by making SINA usable from the command line (--sina option) and adding pydca to compute DCA features (local fields) for the alignment columns

@@ -40,7 +40,7 @@ from Bio.SeqIO.FastaIO import FastaIterator, SimpleFastaParser
from Bio.Seq import MutableSeq
from Bio.SeqRecord import SeqRecord
from Bio.Align import MultipleSeqAlignment
from pydca.plmdca import plmdca
runDir = os.getcwd()
def trace_unhandled_exceptions(func):
@@ -76,11 +76,11 @@ python_executable = "python"+".".join(platform.python_version().split('.')[:2])
validsymb = '\U00002705'
warnsymb = '\U000026A0'
errsymb = '\U0000274C'
LSU_set = {"RF00002", "RF02540", "RF02541",
"RF02543", "RF02546"} # From Rfam CLAN 00112
SSU_set = {"RF00177", "RF02542", "RF02545",
"RF01959", "RF01960"} # From Rfam CLAN 00111
no_nts_set = set()
weird_mappings = set()
@@ -960,6 +960,9 @@ class Pipeline:
# Default options:
self.CRYSTAL_RES = 4.0
self.MXSIZE = 48000      # maximum allowable DP matrix size passed to cmalign --mxsize
self.TAU = 1e-17         # tail loss probability passed to cmalign --tau
self.NONBANDED = False   # switched to True when --mxsize is given, to run cmalign --nonbanded
self.KEEP_HETATM = False
self.HOMOLOGY = True
self.USE_KNOWN_ISSUES = True
@@ -967,6 +970,7 @@ class Pipeline:
self.EXTRACT_CHAINS = False
self.REUSE_ALL = False
self.REDUNDANT = False
self.USESINA = False     # when True, align the rRNA families with SINA instead of cmalign
self.SELECT_ONLY = None
self.ARCHIVE = False
self.SAVELOGS = True
@@ -982,7 +986,7 @@ class Pipeline:
setproctitle("RNANet.py process_options()")
try:
opts, _ = getopt.getopt(sys.argv[1:], "r:fhs", ["help", "resolution=", "3d-folder=", "seq-folder=", "keep-hetatm=", "only=", "maxcores=",
opts, _ = getopt.getopt(sys.argv[1:], "r:fhs", ["help", "resolution=", "3d-folder=","mxsize=","seq-folder=", "keep-hetatm=","tau=","only=", "maxcores=","sina",
"from-scratch", "full-inference", "no-homology","redundant", "ignore-issues", "extract",
"all", "no-logs", "archive", "update-homologous", "version"])
except getopt.GetoptError as err:
@@ -1024,9 +1028,13 @@ class Pipeline:
print("--seq-folder=…\t\t\tPath to a folder to store the sequence and alignment files. Subfolders will be:"
"\n\t\t\t\t\trfam_sequences/fasta/\tCompressed hits to Rfam families"
"\n\t\t\t\t\trealigned/\t\tSequences, covariance models, and alignments by family")
print("--sina\t\t\tForce the RNANet to align large subunit LSU and small subunit SSU ribosomal RNA using sina instead of infernal,"
"\n\t\t\t\t\t the other RNA families will be aligned using infernal.")
print("--maxcores=…\t\t\tLimit the number of cores to use in parallel portions to reduce the simultaneous"
"\n\t\t\t\t need of RAM. Should be a number between 1 and your number of CPUs. Note that portions"
"\n\t\t\t\t of the pipeline already limit themselves to 50% or 70% of that number by default.")
print("--tau=…\t\t\tThe tail loss probability used during HMM band calculation.")
print("--mxsize=…\t\t\tThe maximum allowable total DP matrix size.")
print("--archive\t\t\tCreate tar.gz archives of the datapoints text files and the alignments,"
"\n\t\t\t\t and update the link to the latest archive. ")
print("--no-logs\t\t\tDo not save per-chain logs of the numbering modifications")
@@ -1044,7 +1052,7 @@ class Pipeline:
print(f"nohup bash -c 'time {fileDir}/RNAnet.py --3d-folder ~/Data/RNA/3D/ --seq-folder ~/Data/RNA/sequences -s --no-logs' &")
sys.exit()
elif opt == '--version':
print("RNANet v1.3 beta, parallelized, Dockerized")
print("RNANet v1.4 beta, parallelized, Dockerized")
print("Last revision : Jan 2021")
sys.exit()
elif opt == "-r" or opt == "--resolution":
@@ -1091,6 +1099,11 @@ class Pipeline:
path_to_seq_data + "realigned",
path_to_seq_data + "rfam_sequences"])
self.REUSE_ALL = True
elif opt=="--tau":
self.TAU=float(arg)
elif opt=="--mxsize":
self.MXSIZE=int(arg)
self.NONBANDED=True
elif opt == "--all":
self.REUSE_ALL = True
self.USE_KNOWN_ISSUES = False
@@ -1107,6 +1120,8 @@ class Pipeline:
self.FULLINFERENCE = True
elif opt=="--redundant":
self.REDUNDANT=True
elif opt=="--sina":
self.USESINA=True
if self.HOMOLOGY and "tobedefinedbyoptions" in [path_to_3D_data, path_to_seq_data] or path_to_3D_data == "tobedefinedbyoptions":
print("usage: RNANet.py --3d-folder path/where/to/store/chains --seq-folder path/where/to/store/alignments")
@@ -1731,6 +1746,10 @@ def sql_define_tables(conn):
freq_G REAL,
freq_U REAL,
freq_other REAL,
fields_A REAL,
fields_C REAL,
fields_G REAL,
fields_U REAL,
gap_percent REAL,
consensus CHAR(1),
PRIMARY KEY (rfam_acc, index_ali),
@@ -2187,8 +2206,7 @@ def work_prepare_sequences(dl, rfam_acc, chains):
"""
setproctitle("RNAnet.py work_prepare_sequences()")
if rfam_acc in LSU_set | SSU_set: # rRNA
if self.USESINA and rfam_acc in LSU_set | SSU_set:
if os.path.isfile(path_to_seq_data + f"realigned/{rfam_acc}++.afa"):
# Detect doublons and remove them
existing_afa = AlignIO.read(path_to_seq_data + f"realigned/{rfam_acc}++.afa", "fasta")
@@ -2270,107 +2288,126 @@ def work_prepare_sequences(dl, rfam_acc, chains):
notify(status)
@trace_unhandled_exceptions
def work_realign(rfam_acc):
""" Runs multiple sequence alignements by RNA family.
It aligns the Rfam hits from a RNA family with the sequences from the list of chains.
Rfam covariance models are used with Infernal tools, except for rRNAs.
cmalign requires too much RAM for them, so we use SINA, a specifically designed tool for rRNAs.
def use_sina(rfam_acc):
"""
Aligns the rRNA families (SSU and LSU) with SINA. Only used when the user passed the --sina option.
"""
if rfam_acc in ["RF00177", "RF01960"]:
arbfile = "realigned/SSU.arb"
else:
arbfile = "realigned/LSU.arb"
setproctitle(f"RNAnet.py work_realign({rfam_acc})")
if rfam_acc in LSU_set | SSU_set:
# Ribosomal subunits deserve a special treatment.
# They require too much RAM to be aligned with Infernal.
# Then we will use SINA instead.
if rfam_acc in ["RF00177", "RF01960"]:
arbfile = "realigned/SSU.arb"
else:
arbfile = "realigned/LSU.arb"
# Run alignment
p = subprocess.run(["sina", "-i", path_to_seq_data + f"realigned/{rfam_acc}++.fa",
"-o", path_to_seq_data + f"realigned/{rfam_acc}++.afa",
"-r", path_to_seq_data + arbfile,
"--meta-fmt=csv"])
@trace_unhandled_exceptions
def use_infernal(rfam_acc):
"""
Infernal (cmalign) is the default alignment tool and is used for every RNA family,
except when the user chose (--sina) to align the rRNA families with SINA.
"""
if os.path.isfile(path_to_seq_data + "realigned/" + rfam_acc + "++.stk"):
# Alignment exists. We just want to add new sequences into it.
# Run alignment
p = subprocess.run(["sina", "-i", path_to_seq_data + f"realigned/{rfam_acc}++.fa",
"-o", path_to_seq_data + f"realigned/{rfam_acc}++.afa",
"-r", path_to_seq_data + arbfile,
"--meta-fmt=csv"])
else:
# Align using Infernal for most RNA families
if not os.path.isfile(path_to_seq_data + f"realigned/{rfam_acc}_new.fa"):
# there are no new sequences to align...
return
if os.path.isfile(path_to_seq_data + "realigned/" + rfam_acc + "++.stk"):
# Alignment exists. We just want to add new sequences into it.
existing_ali_path = path_to_seq_data + f"realigned/{rfam_acc}++.stk"
new_ali_path = path_to_seq_data + f"realigned/{rfam_acc}_new.stk"
# Align the new sequences
with open(new_ali_path, 'w') as o:
p1 = subprocess.run(["cmalign", path_to_seq_data + f"realigned/{rfam_acc}.cm",
path_to_seq_data + f"realigned/{rfam_acc}_new.fa"],
stdout=o, stderr=subprocess.PIPE)
notify("Aligned new sequences together")
# Detect doublons and remove them
existing_stk = AlignIO.read(existing_ali_path, "stockholm")
existing_ids = [r.id for r in existing_stk]
del existing_stk
new_stk = AlignIO.read(new_ali_path, "stockholm")
new_ids = [r.id for r in new_stk]
del new_stk
doublons = [i for i in existing_ids if i in new_ids]
del existing_ids, new_ids
if len(doublons):
warn(f"Removing {len(doublons)} doublons from existing {rfam_acc}++.stk and using their newest version")
with open(path_to_seq_data + "realigned/toremove.txt", "w") as toremove:
toremove.write('\n'.join(doublons)+'\n')
p = subprocess.run(["esl-alimanip", "--seq-r", path_to_seq_data + "realigned/toremove.txt", "-o", existing_ali_path+"2", existing_ali_path],
stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
p = subprocess.run(["mv", existing_ali_path+"2", existing_ali_path],
stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
os.remove(path_to_seq_data + "realigned/toremove.txt")
if not os.path.isfile(path_to_seq_data + f"realigned/{rfam_acc}_new.fa"):
# there are no new sequences to align...
return
# And we merge the two alignments
p2 = subprocess.run(["esl-alimerge", "-o", path_to_seq_data + f"realigned/{rfam_acc}_merged.stk",
"--rna", existing_ali_path, new_ali_path],
stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
stderr = p1.stderr.decode('utf-8') + p2.stderr.decode('utf-8')
subprocess.run(["mv", path_to_seq_data + f"realigned/{rfam_acc}_merged.stk", existing_ali_path])
notify("Merged alignments into one")
existing_ali_path = path_to_seq_data + f"realigned/{rfam_acc}++.stk"
new_ali_path = path_to_seq_data + f"realigned/{rfam_acc}_new.stk"
# remove the partial files
os.remove(new_ali_path)
os.remove(path_to_seq_data + f"realigned/{rfam_acc}_new.fa")
# Align the new sequences
with open(new_ali_path, 'w') as o:
p1 = subprocess.run(["cmalign", path_to_seq_data + f"realigned/{rfam_acc}.cm",
path_to_seq_data + f"realigned/{rfam_acc}_new.fa"],
stdout=o, stderr=subprocess.PIPE)
notify("Aligned new sequences together")
else:
# Alignment does not exist yet. We need to compute it from scratch.
print(f"\t> Aligning {rfam_acc} sequences together (cmalign) ...", end='', flush=True)
# Run one of the two cmalign variants: banded, with --tau <value>, or non-banded, with --mxsize <value>
if not self.NONBANDED:
p = subprocess.run(["cmalign", "--tau", f"{self.TAU}",
'-o', path_to_seq_data + f"realigned/{rfam_acc}++.stk",
path_to_seq_data + f"realigned/{rfam_acc}.cm",
path_to_seq_data + f"realigned/{rfam_acc}++.fa"],
stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
else:
p = subprocess.run(["cmalign", "--nonbanded", "--noprob", "--mxsize", f"{self.MXSIZE}",
'-o', path_to_seq_data + f"realigned/{rfam_acc}++.stk",
path_to_seq_data + f"realigned/{rfam_acc}.cm",
path_to_seq_data + f"realigned/{rfam_acc}++.fa"],
stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
stderr = p.stderr.decode("utf-8")
if len(stderr):
print('', flush=True)
warn(f"Error during sequence alignment: {stderr}", error=True)
with open(runDir + "/errors.txt", "a") as er:
er.write(f"Attempting to realign {rfam_acc}:\n" + stderr + '\n')
return 1
else:
print('\t'+validsymb, flush=True)
# Detect doublons and remove them
existing_stk = AlignIO.read(existing_ali_path, "stockholm")
existing_ids = [r.id for r in existing_stk]
del existing_stk
new_stk = AlignIO.read(new_ali_path, "stockholm")
new_ids = [r.id for r in new_stk]
del new_stk
doublons = [i for i in existing_ids if i in new_ids]
del existing_ids, new_ids
if len(doublons):
warn(f"Removing {len(doublons)} doublons from existing {rfam_acc}++.stk and using their newest version")
with open(path_to_seq_data + "realigned/toremove.txt", "w") as toremove:
toremove.write('\n'.join(doublons)+'\n')
p = subprocess.run(["esl-alimanip", "--seq-r", path_to_seq_data + "realigned/toremove.txt", "-o", existing_ali_path+"2", existing_ali_path],
stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
p = subprocess.run(["mv", existing_ali_path+"2", existing_ali_path],
stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
os.remove(path_to_seq_data + "realigned/toremove.txt")
# And we merge the two alignments
p2 = subprocess.run(["esl-alimerge", "-o", path_to_seq_data + f"realigned/{rfam_acc}_merged.stk",
"--rna", existing_ali_path, new_ali_path],
stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
stderr = p1.stderr.decode('utf-8') + p2.stderr.decode('utf-8')
subprocess.run(["mv", path_to_seq_data + f"realigned/{rfam_acc}_merged.stk", existing_ali_path])
notify("Merged alignments into one")
@trace_unhandled_exceptions
def work_realign(rfam_acc):
""" Runs multiple sequence alignements by RNA family.
# remove the partial files
os.remove(new_ali_path)
os.remove(path_to_seq_data + f"realigned/{rfam_acc}_new.fa")
It aligns the Rfam hits from an RNA family with the sequences from the list of chains.
Rfam covariance models are used with the Infernal tools, or SINA, depending on the options provided by the user.
Even if the user asks for SINA, it is only used for the rRNA families; cmalign is used for all the other families.
"""
setproctitle(f"RNAnet.py work_realign({rfam_acc})")
if self.USESINA:
if rfam_acc in LSU_set | SSU_set:
use_sina(rfam_acc)
else:
# Alignment does not exist yet. We need to compute it from scratch.
print(f"\t> Aligning {rfam_acc} sequences together (cmalign) ...", end='', flush=True)
p = subprocess.run(["cmalign", "--small", "--cyk", "--noprob", "--nonbanded", "--notrunc",
'-o', path_to_seq_data + f"realigned/{rfam_acc}++.stk",
path_to_seq_data + f"realigned/{rfam_acc}.cm",
path_to_seq_data + f"realigned/{rfam_acc}++.fa"],
stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
stderr = p.stderr.decode("utf-8")
if len(stderr):
print('', flush=True)
warn(f"Error during sequence alignment: {stderr}", error=True)
with open(runDir + "/errors.txt", "a") as er:
er.write(f"Attempting to realign {rfam_acc}:\n" + stderr + '\n')
return 1
else:
print('\t'+validsymb, flush=True)
# Convert Stockholm to aligned FASTA
subprocess.run(["esl-reformat", "-o", path_to_seq_data + f"realigned/{rfam_acc}++.afa",
use_infernal(rfam_acc)
else:
use_infernal(rfam_acc)
# Convert Stockholm to aligned FASTA
subprocess.run(["esl-reformat", "-o", path_to_seq_data + f"realigned/{rfam_acc}++.afa",
"--informat", "stockholm",
"afa", path_to_seq_data + f"realigned/{rfam_acc}++.stk"])
subprocess.run(["rm", "-f", "esltmp*"]) # We can, because we are not running in parallel for this part.
subprocess.run(["rm", "-f", "esltmp*"]) # We can, because we are not running in parallel for this part.
# Assert everything worked, or save an error
with open(path_to_seq_data + f"realigned/{rfam_acc}++.afa", 'r') as output:
if not len(output.readline()):
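Because old and new lines are interleaved in the hunk above, here is a compact sketch of the control flow of work_realign() after this change (names follow the patch; error handling and the Stockholm-to-FASTA conversion are left out):

```python
# Sketch of the new dispatch in work_realign(): SINA is used only when the user
# passed --sina AND the family is a ribosomal subunit; everything else goes to cmalign.
def work_realign_dispatch(rfam_acc, use_sina_option, lsu_set, ssu_set):
    if use_sina_option and rfam_acc in lsu_set | ssu_set:
        use_sina(rfam_acc)       # SINA against the SSU.arb or LSU.arb reference
    else:
        use_infernal(rfam_acc)   # cmalign: add to an existing ++.stk, or align from
                                 # scratch with --tau (banded) or --nonbanded --mxsize
```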
@@ -2380,6 +2417,81 @@ def work_realign(rfam_acc):
er.write(f"Failed to realign {rfam_acc} (killed)")
@trace_unhandled_exceptions
def compute_from_pydca(f, columns_to_save):
"""Runs pydca's plmDCA on the columns of the family alignment that are saved in the database,
and returns the local fields of those columns, keyed by alignment column index."""
align = AlignIO.read(path_to_seq_data + f"realigned/{f}++.afa", "fasta")
# convert sequences to uppercase, as required by pydca
for s in align:
s.seq=s.seq.upper()
filtered_alignment = align[:, 1:1] # all the lines, but no columns
for p in columns_to_save:
filtered_alignment += align[:, p-1:p] # save columns one by one
# replace any other symbol by a gap, because pydca only accepts the symbols A, C, G, U, '.' and '-'
# (use a MutableSeq so that single positions can be reassigned)
for s in filtered_alignment:
seq = MutableSeq(str(s.seq))
for i in range(len(seq)):
if seq[i].upper() not in "ACGU-.":
seq[i] = '-'
s.seq = seq
#create a fasta file to be used by pydca
with open(path_to_seq_data+f"/realigned/{f}_filtered_for_pydca.afa", "w") as only_3d:
try:
only_3d.write(format(filtered_alignment, "fasta"))
except ValueError as e:
warn(e)
# pydca instance
# here lambda_J is set by pydca to 0.2*(L-1), where L is the length of the sequence;
# the maximum number of gradient descent iterations is set to 500;
# lambda_h is set to 1.0 and seqid to 0.8, as suggested by the pydca paper.
# Reference:
# Zerihun MB, Pucci F, Peter EK, Schug A. pydca v1.0: a comprehensive software for Direct Coupling Analysis of RNA and Protein Sequences.
# Bioinformatics. 2020;36(7):2264–2265. doi:10.1093/bioinformatics/btz892 - https://pubmed.ncbi.nlm.nih.gov/31778142/
plmdca_inst = plmdca.PlmDCA(
path_to_seq_data+f"/realigned/{f}_filtered_for_pydca.afa",
"rna",
seqid = 0.8,
lambda_h = 1.0,
num_threads = 10,
max_iterations = 500,)
# number of site pairs = L*(L-1)/2, where L = len(columns_to_save)
number_of_site_pairs = len(columns_to_save)*(len(columns_to_save)-1)//2
# compute_params() returns a tuple of two lists of tuples:
# the first list contains the fields of the sites (nucleotides),
# the second one contains the couplings of the site pairs.
# linear_dist is set to zero in order to keep all possible pairs,
# because with linear_dist = x > 0 pydca only returns pairs with |i-j| > x,
# which would make us lose a lot of pairs.
params = plmdca_inst.compute_params(linear_dist=0, num_site_pairs=number_of_site_pairs)
# Frobenius norm with average product correction
fn_apc = plmdca_inst.compute_sorted_FN_APC()
family_dca_data = {"PARAMS": np.asarray(params, dtype=object), "FNAPC": np.asarray(fn_apc, dtype=object)}
np.savez(path_to_seq_data+f"/realigned/{f}_pydca.npz", **family_dca_data)
# dictionary returned to work_pssm_remap(), which stores these fields in the align_column table
return_dict_fields = {}
for list_fields in params[0]:
# element 0 of each tuple is the site index within the filtered alignment;
# looking it up in columns_to_save gives back the original alignment column,
# which is used as the key under which the fields (element 1) are stored
return_dict_fields[columns_to_save[list_fields[0]]] = list_fields[1]
subprocess.run(["rm", "-f", path_to_seq_data+f"/realigned/{f}_filtered_for_pydca.afa"])
return return_dict_fields
@trace_unhandled_exceptions
def work_pssm_remap(f):
"""Computes Position-Specific-Scoring-Matrices given the multiple sequence alignment of the RNA family.
This also remaps the 3D object sequence with the aligned sequence in the MSA.
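A small worked example of the index bookkeeping in compute_from_pydca(): the shape of params[0] is assumed from the loop that fills return_dict_fields (tuples of a 0-based site index in the filtered alignment and the list of fields); all numeric values are illustrative:

```python
# Worked example of the column remapping in compute_from_pydca() (illustrative values).
columns_to_save = sorted([5, 12, 40])        # 1-based alignment columns kept for pydca
L = len(columns_to_save)
number_of_site_pairs = L * (L - 1) // 2      # 3 columns -> 3 possible site pairs

# Assumed shape of params[0]: [(site_index_in_filtered_alignment, [fields...]), ...]
params_fields = [(0, [0.10, 0.20, 0.30, 0.40]),
                 (1, [0.00, 0.50, 0.20, 0.30]),
                 (2, [0.25, 0.25, 0.25, 0.25])]

# Remap each filtered-alignment site back to its original alignment column
return_dict_fields = {columns_to_save[i]: fields for i, fields in params_fields}
assert set(return_dict_fields) == {5, 12, 40} and number_of_site_pairs == 3
```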
@@ -2560,16 +2672,16 @@ def work_pssm_remap(f):
unused.append((f, col))
sql_execute(conn, """DELETE FROM align_column WHERE rfam_acc = ? AND index_ali = ?;""", many=True, data=unused)
conn.commit()
# Compute the pydca plmDCA local fields for the columns that will be saved
rfam_fields_record = compute_from_pydca(f, sorted(columns_to_save))
# Save the useful columns in the database
data = [(f, j) + tuple(pssm_info[:,j-1]) + (consensus[j-1],) for j in sorted(columns_to_save)]
sql_execute(conn, """INSERT INTO align_column (rfam_acc, index_ali, freq_A, freq_C, freq_G, freq_U, freq_other, gap_percent, consensus)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?) ON CONFLICT(rfam_acc, index_ali) DO
# NB: the data tuples are ordered (rfam_acc, index_ali, freq_A..freq_other, gap_percent, fields_A..fields_U, consensus),
# because the six rows of pssm_info end with gap_percent; the column list below follows that order.
data = [(f, j) + tuple(pssm_info[:,j-1]) + tuple(rfam_fields_record[j]) + (consensus[j-1],) for j in sorted(columns_to_save)]
sql_execute(conn, """INSERT INTO align_column (rfam_acc, index_ali, freq_A, freq_C, freq_G, freq_U, freq_other, gap_percent, fields_A, fields_C, fields_G, fields_U, consensus)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) ON CONFLICT(rfam_acc, index_ali) DO
UPDATE SET freq_A=excluded.freq_A, freq_C=excluded.freq_C, freq_G=excluded.freq_G, freq_U=excluded.freq_U,
freq_other=excluded.freq_other, gap_percent=excluded.gap_percent, consensus=excluded.consensus;""", many=True, data=data)
freq_other=excluded.freq_other, fields_A=excluded.fields_A, fields_C=excluded.fields_C, fields_G=excluded.fields_G, fields_U=excluded.fields_U, gap_percent=excluded.gap_percent, consensus=excluded.consensus;""", many=True, data=data)
# Add an unknown values column, with index_ali 0 (for nucleotides unsolved in 3D giving a gap '-' but found facing letter in the alignment)
sql_execute(conn, f"""INSERT OR IGNORE INTO align_column (rfam_acc, index_ali, freq_A, freq_C, freq_G, freq_U, freq_other, gap_percent, consensus)
VALUES (?, 0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, '-');""", data=(f,))
sql_execute(conn, f"""INSERT OR IGNORE INTO align_column (rfam_acc, index_ali, freq_A, freq_C, freq_G, freq_U, freq_other,fields_A,fields_C,fields_G,fields_U, gap_percent, consensus)
VALUES (?, 0, 0.0, 0.0, 0.0, 0.0, 0.0,0.0,0.0,0.0,0.0,1.0,'-');""", data=(f,))
# Save the number of "used columns" to table family ( = the length of the alignment if it was composed only of the RNANet chains)
sql_execute(conn, f"UPDATE family SET ali_filtered_len = ? WHERE rfam_acc = ?;", data=(len(columns_to_save), f))
conn.close()
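Once the pipeline has run with these changes, the new fields columns can be queried like any other per-column value. A minimal usage sketch, assuming the default database location (runDir/results/RNANet.db) and an arbitrary example family accession:

```python
# Read back the pydca fields stored in align_column (sketch; path and accession are assumptions).
import os
import sqlite3

runDir = os.getcwd()
with sqlite3.connect(runDir + "/results/RNANet.db") as conn:
    rows = conn.execute("""SELECT index_ali, fields_A, fields_C, fields_G, fields_U
                           FROM align_column
                           WHERE rfam_acc = ? AND index_ali > 0
                           ORDER BY index_ali;""", ("RF00005",)).fetchall()
for index_ali, f_a, f_c, f_g, f_u in rows[:5]:
    print(index_ali, f_a, f_c, f_g, f_u)
```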
@@ -2613,7 +2725,7 @@ def work_save(c, homology=True):
if homology:
df = pd.read_sql_query(f"""
SELECT index_chain, old_nt_resnum, nt_position, nt_name, nt_code, nt_align_code,
is_A, is_C, is_G, is_U, is_other, freq_A, freq_C, freq_G, freq_U, freq_other, gap_percent, consensus, dbn,
is_A, is_C, is_G, is_U, is_other, freq_A, freq_C, freq_G, freq_U, freq_other, fields_A, fields_C, fields_G, fields_U, gap_percent, consensus, dbn,
paired, nb_interact, pair_type_LW, pair_type_DSSR, alpha, beta, gamma, delta, epsilon, zeta, epsilon_zeta,
chi, bb_type, glyco_bond, form, ssZp, Dp, eta, theta, eta_prime, theta_prime, eta_base, theta_base,
v0, v1, v2, v3, v4, amplitude, phase_angle, puckering FROM
@@ -952,8 +952,8 @@ def par_distance_matrix(filelist, f, label, consider_all_atoms, s):
else:
d[i,j] = get_euclidian_distance(coordinates_with_gaps[i], coordinates_with_gaps[j])
if f not in LSU_set and f not in SSU_set:
np.savetxt(runDir + '/results/distance_matrices/' + f + '_'+ label + '/'+ s.id.strip("\'") + '.csv', d, delimiter=",", fmt="%.3f")
# if f not in LSU_set and f not in SSU_set:
np.savetxt(runDir + '/results/distance_matrices/' + f + '_'+ label + '/'+ s.id.strip("\'") + '.csv', d, delimiter=",", fmt="%.3f")
return 1-np.isnan(d).astype(int), np.nan_to_num(d), np.nan_to_num(d*d)
@trace_unhandled_exceptions
@@ -1173,7 +1173,7 @@ if __name__ == "__main__":
sys.exit()
elif opt == '--version':
print("RNANet statistics 1.3 beta")
print("RNANet statistics 1.4 beta")
sys.exit()
elif opt == "-r" or opt == "--resolution":
assert float(arg) > 0.0 and float(arg) <= 20.0