Merged Khodor-internship to master + general cmalign options
Showing 2 changed files with 253 additions and 140 deletions
... | @@ -40,7 +40,7 @@ from Bio.SeqIO.FastaIO import FastaIterator, SimpleFastaParser | ... | @@ -40,7 +40,7 @@ from Bio.SeqIO.FastaIO import FastaIterator, SimpleFastaParser |
40 | from Bio.Seq import MutableSeq | 40 | from Bio.Seq import MutableSeq |
41 | from Bio.SeqRecord import SeqRecord | 41 | from Bio.SeqRecord import SeqRecord |
42 | from Bio.Align import MultipleSeqAlignment | 42 | from Bio.Align import MultipleSeqAlignment |
43 | - | 43 | +from pydca.plmdca import plmdca |
44 | runDir = os.getcwd() | 44 | runDir = os.getcwd() |
45 | 45 | ||
46 | def trace_unhandled_exceptions(func): | 46 | def trace_unhandled_exceptions(func): |
... | @@ -76,11 +76,11 @@ python_executable = "python"+".".join(platform.python_version().split('.')[:2]) | ... | @@ -76,11 +76,11 @@ python_executable = "python"+".".join(platform.python_version().split('.')[:2]) |
76 | validsymb = '\U00002705' | 76 | validsymb = '\U00002705' |
77 | warnsymb = '\U000026A0' | 77 | warnsymb = '\U000026A0' |
78 | errsymb = '\U0000274C' | 78 | errsymb = '\U0000274C' |
79 | - | ||
80 | LSU_set = {"RF00002", "RF02540", "RF02541", | 79 | LSU_set = {"RF00002", "RF02540", "RF02541", |
81 | "RF02543", "RF02546"} # From Rfam CLAN 00112 | 80 | "RF02543", "RF02546"} # From Rfam CLAN 00112 |
82 | SSU_set = {"RF00177", "RF02542", "RF02545", | 81 | SSU_set = {"RF00177", "RF02542", "RF02545", |
83 | "RF01959", "RF01960"} # From Rfam CLAN 00111 | 82 | "RF01959", "RF01960"} # From Rfam CLAN 00111 |
83 | + | ||
84 | no_nts_set = set() | 84 | no_nts_set = set() |
85 | weird_mappings = set() | 85 | weird_mappings = set() |
86 | 86 | ||
... | @@ -845,14 +845,14 @@ class Downloader: | ... | @@ -845,14 +845,14 @@ class Downloader: |
845 | if os.path.isfile(path_to_3D_data + f"latest_nr_list_{nr_code}A.csv"): | 845 | if os.path.isfile(path_to_3D_data + f"latest_nr_list_{nr_code}A.csv"): |
846 | print("\t> Use of the previous version.\t", end="", flush=True) | 846 | print("\t> Use of the previous version.\t", end="", flush=True) |
847 | else: | 847 | else: |
848 | - return pd.DataFrame([], columns=["class", "class_members"]) | 848 | + return pd.DataFrame([], columns=["class","representative","class_members"]) |
849 | 849 | ||
850 | nrlist = pd.read_csv(path_to_3D_data + f"latest_nr_list_{nr_code}A.csv") | 850 | nrlist = pd.read_csv(path_to_3D_data + f"latest_nr_list_{nr_code}A.csv") |
851 | - full_structures_list = [ tuple(i[1]) for i in nrlist[['class', 'class_members']].iterrows() ] | 851 | + full_structures_list = [ tuple(i[1]) for i in nrlist[["class","representative","class_members"]].iterrows() ] |
852 | print(f"\t{validsymb}", flush=True) | 852 | print(f"\t{validsymb}", flush=True) |
853 | 853 | ||
854 | # The beginning of an adventure. | 854 | # The beginning of an adventure. |
855 | - return full_structures_list # list of ( str (class), str (class_members) ) | 855 | + return full_structures_list # list of ( str (class), str(representative),str (class_members) ) |
856 | 856 | ||
857 | def download_from_SILVA(self, unit): | 857 | def download_from_SILVA(self, unit): |
858 | 858 | ||
... | @@ -966,6 +966,9 @@ class Pipeline: | ... | @@ -966,6 +966,9 @@ class Pipeline: |
966 | self.RUN_STATS = False | 966 | self.RUN_STATS = False |
967 | self.EXTRACT_CHAINS = False | 967 | self.EXTRACT_CHAINS = False |
968 | self.REUSE_ALL = False | 968 | self.REUSE_ALL = False |
969 | + self.REDUNDANT = False | ||
970 | + self.ALIGNOPTS = None | ||
971 | + self.USESINA = False | ||
969 | self.SELECT_ONLY = None | 972 | self.SELECT_ONLY = None |
970 | self.ARCHIVE = False | 973 | self.ARCHIVE = False |
971 | self.SAVELOGS = True | 974 | self.SAVELOGS = True |
... | @@ -981,8 +984,9 @@ class Pipeline: | ... | @@ -981,8 +984,9 @@ class Pipeline: |
981 | setproctitle("RNANet.py process_options()") | 984 | setproctitle("RNANet.py process_options()") |
982 | 985 | ||
983 | try: | 986 | try: |
984 | - opts, _ = getopt.getopt(sys.argv[1:], "r:fhs", ["help", "resolution=", "3d-folder=", "seq-folder=", "keep-hetatm=", "only=", "maxcores=", | 987 | + opts, _ = getopt.getopt(sys.argv[1:], "r:fhs", ["help", "resolution=", "3d-folder=", "seq-folder=", "keep-hetatm=", |
985 | - "from-scratch", "full-inference", "no-homology", "ignore-issues", "extract", | 988 | + "only=", "cmalign-opts=", "maxcores=", "sina", "from-scratch", |
989 | + "full-inference", "no-homology", "redundant", "ignore-issues", "extract", | ||
986 | "all", "no-logs", "archive", "update-homologous", "version"]) | 990 | "all", "no-logs", "archive", "update-homologous", "version"]) |
987 | except getopt.GetoptError as err: | 991 | except getopt.GetoptError as err: |
988 | print(err) | 992 | print(err) |
... | @@ -1006,6 +1010,7 @@ class Pipeline: | ... | @@ -1006,6 +1010,7 @@ class Pipeline: |
1006 | print("--------------------------------------------------------------------------------------------------------------") | 1010 | print("--------------------------------------------------------------------------------------------------------------") |
1007 | print("-f [ --full-inference ]\t\tInfer new mappings even if Rfam already provides some. Yields more copies of" | 1011 | print("-f [ --full-inference ]\t\tInfer new mappings even if Rfam already provides some. Yields more copies of" |
1008 | "\n\t\t\t\t chains mapped to different families.") | 1012 | "\n\t\t\t\t chains mapped to different families.") |
1013 | + print("--redundant\t\t\t\tStore the class members in the database thought to be redundant for predictions.") | ||
1009 | print("-s\t\t\t\tRun statistics computations after completion") | 1014 | print("-s\t\t\t\tRun statistics computations after completion") |
1010 | print("--extract\t\t\tExtract the portions of 3D RNA chains to individual mmCIF files.") | 1015 | print("--extract\t\t\tExtract the portions of 3D RNA chains to individual mmCIF files.") |
1011 | print("--keep-hetatm=False\t\t(True | False) Keep ions, waters and ligands in produced mmCIF files. " | 1016 | print("--keep-hetatm=False\t\t(True | False) Keep ions, waters and ligands in produced mmCIF files. " |
... | @@ -1022,9 +1027,12 @@ class Pipeline: | ... | @@ -1022,9 +1027,12 @@ class Pipeline: |
1022 | print("--seq-folder=…\t\t\tPath to a folder to store the sequence and alignment files. Subfolders will be:" | 1027 | print("--seq-folder=…\t\t\tPath to a folder to store the sequence and alignment files. Subfolders will be:" |
1023 | "\n\t\t\t\t\trfam_sequences/fasta/\tCompressed hits to Rfam families" | 1028 | "\n\t\t\t\t\trfam_sequences/fasta/\tCompressed hits to Rfam families" |
1024 | "\n\t\t\t\t\trealigned/\t\tSequences, covariance models, and alignments by family") | 1029 | "\n\t\t\t\t\trealigned/\t\tSequences, covariance models, and alignments by family") |
1030 | + print("--sina\t\t\tForce the RNANet to align large subunit LSU and small subunit SSU ribosomal RNA using sina instead of infernal," | ||
1031 | + "\n\t\t\t\t\t the other RNA families will be aligned using infernal.") | ||
1025 | print("--maxcores=…\t\t\tLimit the number of cores to use in parallel portions to reduce the simultaneous" | 1032 | print("--maxcores=…\t\t\tLimit the number of cores to use in parallel portions to reduce the simultaneous" |
1026 | "\n\t\t\t\t need of RAM. Should be a number between 1 and your number of CPUs. Note that portions" | 1033 | "\n\t\t\t\t need of RAM. Should be a number between 1 and your number of CPUs. Note that portions" |
1027 | "\n\t\t\t\t of the pipeline already limit themselves to 50% or 70% of that number by default.") | 1034 | "\n\t\t\t\t of the pipeline already limit themselves to 50% or 70% of that number by default.") |
1035 | + print("--cmalign-opts=…\t\tA string of additional options to pass to cmalign aligner, e.g. \"--nonbanded --mxsize 2048\"") | ||
1028 | print("--archive\t\t\tCreate tar.gz archives of the datapoints text files and the alignments," | 1036 | print("--archive\t\t\tCreate tar.gz archives of the datapoints text files and the alignments," |
1029 | "\n\t\t\t\t and update the link to the latest archive. ") | 1037 | "\n\t\t\t\t and update the link to the latest archive. ") |
1030 | print("--no-logs\t\t\tDo not save per-chain logs of the numbering modifications") | 1038 | print("--no-logs\t\t\tDo not save per-chain logs of the numbering modifications") |
... | @@ -1042,8 +1050,8 @@ class Pipeline: | ... | @@ -1042,8 +1050,8 @@ class Pipeline: |
1042 | print(f"nohup bash -c 'time {fileDir}/RNAnet.py --3d-folder ~/Data/RNA/3D/ --seq-folder ~/Data/RNA/sequences -s --no-logs' &") | 1050 | print(f"nohup bash -c 'time {fileDir}/RNAnet.py --3d-folder ~/Data/RNA/3D/ --seq-folder ~/Data/RNA/sequences -s --no-logs' &") |
1043 | sys.exit() | 1051 | sys.exit() |
1044 | elif opt == '--version': | 1052 | elif opt == '--version': |
1045 | - print("RNANet v1.3 beta, parallelized, Dockerized") | 1053 | + print("RNANet v1.4 beta, parallelized, Dockerized") |
1046 | - print("Last revision : Jan 2021") | 1054 | + print("Last revision : April 2021") |
1047 | sys.exit() | 1055 | sys.exit() |
1048 | elif opt == "-r" or opt == "--resolution": | 1056 | elif opt == "-r" or opt == "--resolution": |
1049 | assert float(arg) > 0.0 and float(arg) <= 20.0 | 1057 | assert float(arg) > 0.0 and float(arg) <= 20.0 |
... | @@ -1089,6 +1097,8 @@ class Pipeline: | ... | @@ -1089,6 +1097,8 @@ class Pipeline: |
1089 | path_to_seq_data + "realigned", | 1097 | path_to_seq_data + "realigned", |
1090 | path_to_seq_data + "rfam_sequences"]) | 1098 | path_to_seq_data + "rfam_sequences"]) |
1091 | self.REUSE_ALL = True | 1099 | self.REUSE_ALL = True |
1100 | + elif opt == "--cmalign-opts": | ||
1101 | + self.ALIGNOPTS = arg | ||
1092 | elif opt == "--all": | 1102 | elif opt == "--all": |
1093 | self.REUSE_ALL = True | 1103 | self.REUSE_ALL = True |
1094 | self.USE_KNOWN_ISSUES = False | 1104 | self.USE_KNOWN_ISSUES = False |
... | @@ -1103,6 +1113,10 @@ class Pipeline: | ... | @@ -1103,6 +1113,10 @@ class Pipeline: |
1103 | ncores = min(ncores, int(arg)) | 1113 | ncores = min(ncores, int(arg)) |
1104 | elif opt == "-f" or opt == "--full-inference": | 1114 | elif opt == "-f" or opt == "--full-inference": |
1105 | self.FULLINFERENCE = True | 1115 | self.FULLINFERENCE = True |
1116 | + elif opt=="--redundant": | ||
1117 | + self.REDUNDANT=True | ||
1118 | + elif opt=="--sina": | ||
1119 | + self.USESINA=True | ||
1106 | 1120 | ||
1107 | if self.HOMOLOGY and "tobedefinedbyoptions" in [path_to_3D_data, path_to_seq_data] or path_to_3D_data == "tobedefinedbyoptions": | 1121 | if self.HOMOLOGY and "tobedefinedbyoptions" in [path_to_3D_data, path_to_seq_data] or path_to_3D_data == "tobedefinedbyoptions": |
1108 | print("usage: RNANet.py --3d-folder path/where/to/store/chains --seq-folder path/where/to/store/alignments") | 1122 | print("usage: RNANet.py --3d-folder path/where/to/store/chains --seq-folder path/where/to/store/alignments") |
... | @@ -1151,7 +1165,8 @@ class Pipeline: | ... | @@ -1151,7 +1165,8 @@ class Pipeline: |
1151 | work_infer_mappings, | 1165 | work_infer_mappings, |
1152 | not self.REUSE_ALL, | 1166 | not self.REUSE_ALL, |
1153 | allmappings, | 1167 | allmappings, |
1154 | - self.FULLINFERENCE | 1168 | + self.FULLINFERENCE, |
1169 | + self.REDUNDANT | ||
1155 | ), | 1170 | ), |
1156 | full_structures_list, | 1171 | full_structures_list, |
1157 | chunksize=1)): | 1172 | chunksize=1)): |
... | @@ -1340,7 +1355,7 @@ class Pipeline: | ... | @@ -1340,7 +1355,7 @@ class Pipeline: |
1340 | joblist = [] | 1355 | joblist = [] |
1341 | for f in self.fam_list: | 1356 | for f in self.fam_list: |
1342 | joblist.append(Job(function=work_prepare_sequences, how_many_in_parallel=ncores, args=[ | 1357 | joblist.append(Job(function=work_prepare_sequences, how_many_in_parallel=ncores, args=[ |
1343 | - self.dl, f, rfam_acc_to_download[f]])) | 1358 | + self.dl, self.USESINA, f, rfam_acc_to_download[f]])) |
1344 | try: | 1359 | try: |
1345 | execute_joblist(joblist) | 1360 | execute_joblist(joblist) |
1346 | 1361 | ||
... | @@ -1365,7 +1380,7 @@ class Pipeline: | ... | @@ -1365,7 +1380,7 @@ class Pipeline: |
1365 | joblist = [] | 1380 | joblist = [] |
1366 | for f in self.fam_list: | 1381 | for f in self.fam_list: |
1367 | # the function already uses all CPUs so launch them one by one (how_many_in_parallel=1) | 1382 | # the function already uses all CPUs so launch them one by one (how_many_in_parallel=1) |
1368 | - joblist.append(Job(function=work_realign, args=[f], how_many_in_parallel=1, label=f)) | 1383 | + joblist.append(Job(function=work_realign, args=[self.USESINA, self.ALIGNOPTS, f], how_many_in_parallel=1, label=f)) |
1369 | 1384 | ||
1370 | # Execute the jobs | 1385 | # Execute the jobs |
1371 | try: | 1386 | try: |
... | @@ -1753,6 +1768,10 @@ def sql_define_tables(conn): | ... | @@ -1753,6 +1768,10 @@ def sql_define_tables(conn): |
1753 | freq_G REAL, | 1768 | freq_G REAL, |
1754 | freq_U REAL, | 1769 | freq_U REAL, |
1755 | freq_other REAL, | 1770 | freq_other REAL, |
1771 | + fields_A REAL, | ||
1772 | + fields_C REAL, | ||
1773 | + fields_G REAL, | ||
1774 | + fields_U REAL, | ||
1756 | gap_percent REAL, | 1775 | gap_percent REAL, |
1757 | consensus CHAR(1), | 1776 | consensus CHAR(1), |
1758 | cons_sec_struct CHAR(1), | 1777 | cons_sec_struct CHAR(1), |
... | @@ -1937,7 +1956,7 @@ def execute_joblist(fulljoblist): | ... | @@ -1937,7 +1956,7 @@ def execute_joblist(fulljoblist): |
1937 | return results | 1956 | return results |
1938 | 1957 | ||
1939 | @trace_unhandled_exceptions | 1958 | @trace_unhandled_exceptions |
1940 | -def work_infer_mappings(update_only, allmappings, fullinference, codelist) -> list: | 1959 | +def work_infer_mappings(update_only, allmappings, fullinference,redundant, codelist) -> list: |
1941 | """Given a list of PDB chains corresponding to an equivalence class from BGSU's NR list, | 1960 | """Given a list of PDB chains corresponding to an equivalence class from BGSU's NR list, |
1942 | build a list of Chain() objects mapped to Rfam families, by expanding available mappings | 1961 | build a list of Chain() objects mapped to Rfam families, by expanding available mappings |
1943 | of any element of the list to all the list elements. | 1962 | of any element of the list to all the list elements. |
... | @@ -1950,8 +1969,8 @@ def work_infer_mappings(update_only, allmappings, fullinference, codelist) -> li | ... | @@ -1950,8 +1969,8 @@ def work_infer_mappings(update_only, allmappings, fullinference, codelist) -> li |
1950 | 1969 | ||
1951 | # Split the comma-separated list of chain codes into chain codes: | 1970 | # Split the comma-separated list of chain codes into chain codes: |
1952 | eq_class = codelist[0] | 1971 | eq_class = codelist[0] |
1953 | - codes = codelist[1].replace('+', ',').split(',') | 1972 | + codes = codelist[2].replace('+', ',').split(',') |
1954 | - | 1973 | + representative=codelist[1].replace('+', ',').split(',')[0] |
1955 | # Search for mappings that apply to an element of this PDB chains list: | 1974 | # Search for mappings that apply to an element of this PDB chains list: |
1956 | for c in codes: | 1975 | for c in codes: |
1957 | # search for Rfam mappings with this chain c: | 1976 | # search for Rfam mappings with this chain c: |
... | @@ -2040,6 +2059,13 @@ def work_infer_mappings(update_only, allmappings, fullinference, codelist) -> li | ... | @@ -2040,6 +2059,13 @@ def work_infer_mappings(update_only, allmappings, fullinference, codelist) -> li |
2040 | 2059 | ||
2041 | # Now build Chain() objects for the mapped chains | 2060 | # Now build Chain() objects for the mapped chains |
2042 | for c in codes: | 2061 | for c in codes: |
2062 | + | ||
2063 | + if not redundant and c!=representative: | ||
2064 | + ''' | ||
2065 | + by default save only the representative member | ||
2066 | + if redundant is passed then save all the chains of the class members | ||
2067 | + ''' | ||
2068 | + continue | ||
2043 | nr = c.split('|') | 2069 | nr = c.split('|') |
2044 | pdb_id = nr[0].lower() | 2070 | pdb_id = nr[0].lower() |
2045 | pdb_model = int(nr[1]) | 2071 | pdb_model = int(nr[1]) |
... | @@ -2202,13 +2228,12 @@ def work_build_chain(c, extract, khetatm, retrying=False, save_logs=True): | ... | @@ -2202,13 +2228,12 @@ def work_build_chain(c, extract, khetatm, retrying=False, save_logs=True): |
2202 | return c | 2228 | return c |
2203 | 2229 | ||
2204 | @trace_unhandled_exceptions | 2230 | @trace_unhandled_exceptions |
2205 | -def work_prepare_sequences(dl, rfam_acc, chains): | 2231 | +def work_prepare_sequences(dl, useSina, rfam_acc, chains): |
2206 | """Prepares FASTA files of homologous sequences to realign with cmalign or SINA. | 2232 | """Prepares FASTA files of homologous sequences to realign with cmalign or SINA. |
2207 | """ | 2233 | """ |
2208 | 2234 | ||
2209 | setproctitle("RNAnet.py work_prepare_sequences()") | 2235 | setproctitle("RNAnet.py work_prepare_sequences()") |
2210 | - | 2236 | + if useSina and rfam_acc in LSU_set | SSU_set: |
2211 | - if rfam_acc in LSU_set | SSU_set: # rRNA | ||
2212 | if os.path.isfile(path_to_seq_data + f"realigned/{rfam_acc}++.afa"): | 2237 | if os.path.isfile(path_to_seq_data + f"realigned/{rfam_acc}++.afa"): |
2213 | # Detect doublons and remove them | 2238 | # Detect doublons and remove them |
2214 | existing_afa = AlignIO.read(path_to_seq_data + f"realigned/{rfam_acc}++.afa", "fasta") | 2239 | existing_afa = AlignIO.read(path_to_seq_data + f"realigned/{rfam_acc}++.afa", "fasta") |
... | @@ -2290,109 +2315,122 @@ def work_prepare_sequences(dl, rfam_acc, chains): | ... | @@ -2290,109 +2315,122 @@ def work_prepare_sequences(dl, rfam_acc, chains): |
2290 | notify(status) | 2315 | notify(status) |
2291 | 2316 | ||
2292 | @trace_unhandled_exceptions | 2317 | @trace_unhandled_exceptions |
2293 | -def work_realign(rfam_acc): | 2318 | +def use_sina(rfam_acc): |
2294 | - """ Runs multiple sequence alignements by RNA family. | ||
2295 | - | ||
2296 | - It aligns the Rfam hits from a RNA family with the sequences from the list of chains. | ||
2297 | - Rfam covariance models are used with Infernal tools, except for rRNAs. | ||
2298 | - cmalign requires too much RAM for them, so we use SINA, a specifically designed tool for rRNAs. | ||
2299 | """ | 2319 | """ |
2300 | - | 2320 | + When prompted by the user to use SINA the software will use SINA for rRNA SSU and LSU |
2301 | - setproctitle(f"RNAnet.py work_realign({rfam_acc})") | 2321 | + """ |
2302 | - | 2322 | + if rfam_acc in ["RF00177", "RF01960"]: |
2303 | - if rfam_acc in LSU_set | SSU_set: | 2323 | + arbfile = "realigned/SSU.arb" |
2304 | - # Ribosomal subunits deserve a special treatment. | ||
2305 | - # They require too much RAM to be aligned with Infernal. | ||
2306 | - # Then we will use SINA instead. | ||
2307 | - if rfam_acc in ["RF00177", "RF01960"]: | ||
2308 | - arbfile = "realigned/SSU.arb" | ||
2309 | - else: | ||
2310 | - arbfile = "realigned/LSU.arb" | ||
2311 | - | ||
2312 | - # Run alignment | ||
2313 | - p = subprocess.run(["sina", "-i", path_to_seq_data + f"realigned/{rfam_acc}++.fa", | ||
2314 | - "-o", path_to_seq_data + f"realigned/{rfam_acc}++.afa", | ||
2315 | - "-r", path_to_seq_data + arbfile, | ||
2316 | - "--meta-fmt=csv"]) | ||
2317 | else: | 2324 | else: |
2318 | - # Align using Infernal for most RNA families | 2325 | + arbfile = "realigned/LSU.arb" |
2319 | - | ||
2320 | - if os.path.isfile(path_to_seq_data + "realigned/" + rfam_acc + "++.stk"): | ||
2321 | - # Alignment exists. We just want to add new sequences into it. | ||
2322 | - | ||
2323 | - if not os.path.isfile(path_to_seq_data + f"realigned/{rfam_acc}_new.fa"): | ||
2324 | - # there are no new sequences to align... | ||
2325 | - return | ||
2326 | 2326 | ||
2327 | - existing_ali_path = path_to_seq_data + f"realigned/{rfam_acc}++.stk" | 2327 | + # Run alignment |
2328 | - new_ali_path = path_to_seq_data + f"realigned/{rfam_acc}_new.stk" | 2328 | + subprocess.run(["sina", "-i", path_to_seq_data + f"realigned/{rfam_acc}++.fa", |
2329 | + "-o", path_to_seq_data + f"realigned/{rfam_acc}++.afa", | ||
2330 | + "-r", path_to_seq_data + arbfile, | ||
2331 | + "--meta-fmt=csv"]) | ||
2329 | 2332 | ||
2330 | - # Align the new sequences | 2333 | +@trace_unhandled_exceptions |
2331 | - with open(new_ali_path, 'w') as o: | 2334 | +def use_infernal(rfam_acc, alignopts): |
2332 | - p1 = subprocess.run(["cmalign", "--nonbanded", "--ifile", path_to_seq_data + f"realigned/{rfam_acc}.ins", | 2335 | + """ |
2333 | - "--sfile", path_to_seq_data + f"realigned/{rfam_acc}.tsv", | 2336 | + Infernal is our default alignment tool except if prompted by the user. |
2334 | - path_to_seq_data + f"realigned/{rfam_acc}.cm", | 2337 | + Cmalign will be used for all families except when the user prefers to align rRNA with SINA |
2335 | - path_to_seq_data + f"realigned/{rfam_acc}_new.fa"], | 2338 | + """ |
2336 | - stdout=o, stderr=subprocess.PIPE) | 2339 | + if os.path.isfile(path_to_seq_data + "realigned/" + rfam_acc + "++.stk"): |
2337 | - notify("Aligned new sequences together") | 2340 | + # Alignment exists. We just want to add new sequences into it. |
2338 | - | ||
2339 | - # Detect doublons and remove them | ||
2340 | - existing_stk = AlignIO.read(existing_ali_path, "stockholm") | ||
2341 | - existing_ids = [r.id for r in existing_stk] | ||
2342 | - del existing_stk | ||
2343 | - new_stk = AlignIO.read(new_ali_path, "stockholm") | ||
2344 | - new_ids = [r.id for r in new_stk] | ||
2345 | - del new_stk | ||
2346 | - doublons = [i for i in existing_ids if i in new_ids] | ||
2347 | - del existing_ids, new_ids | ||
2348 | - if len(doublons): | ||
2349 | - warn(f"Removing {len(doublons)} doublons from existing {rfam_acc}++.stk and using their newest version") | ||
2350 | - with open(path_to_seq_data + "realigned/toremove.txt", "w") as toremove: | ||
2351 | - toremove.write('\n'.join(doublons)+'\n') | ||
2352 | - p = subprocess.run(["esl-alimanip", "--seq-r", path_to_seq_data + "realigned/toremove.txt", "-o", existing_ali_path+"2", existing_ali_path], | ||
2353 | - stdout=subprocess.DEVNULL, stderr=subprocess.PIPE) | ||
2354 | - p = subprocess.run(["mv", existing_ali_path+"2", existing_ali_path], | ||
2355 | - stdout=subprocess.DEVNULL, stderr=subprocess.PIPE) | ||
2356 | - os.remove(path_to_seq_data + "realigned/toremove.txt") | ||
2357 | - | ||
2358 | - # And we merge the two alignments | ||
2359 | - p2 = subprocess.run(["esl-alimerge", "-o", path_to_seq_data + f"realigned/{rfam_acc}_merged.stk", | ||
2360 | - "--rna", existing_ali_path, new_ali_path], | ||
2361 | - stdout=subprocess.DEVNULL, stderr=subprocess.PIPE) | ||
2362 | - stderr = p1.stderr.decode('utf-8') + p2.stderr.decode('utf-8') | ||
2363 | - subprocess.run(["mv", path_to_seq_data + f"realigned/{rfam_acc}_merged.stk", existing_ali_path]) | ||
2364 | - notify("Merged alignments into one") | ||
2365 | 2341 | ||
2366 | - # remove the partial files | 2342 | + if not os.path.isfile(path_to_seq_data + f"realigned/{rfam_acc}_new.fa"): |
2367 | - os.remove(new_ali_path) | 2343 | + # there are no new sequences to align... |
2368 | - os.remove(path_to_seq_data + f"realigned/{rfam_acc}_new.fa") | 2344 | + return |
2369 | 2345 | ||
2370 | - else: | 2346 | + existing_ali_path = path_to_seq_data + f"realigned/{rfam_acc}++.stk" |
2371 | - # Alignment does not exist yet. We need to compute it from scratch. | 2347 | + new_ali_path = path_to_seq_data + f"realigned/{rfam_acc}_new.stk" |
2372 | - print(f"\t> Aligning {rfam_acc} sequences together (cmalign) ...", end='', flush=True) | ||
2373 | 2348 | ||
2374 | - p = subprocess.run(["cmalign", "--nonbanded", '-o', path_to_seq_data + f"realigned/{rfam_acc}++.stk", | 2349 | + # Align the new sequences |
2375 | - "--ifile", path_to_seq_data + f"realigned/{rfam_acc}.ins", "--sfile", path_to_seq_data + f"realigned/{rfam_acc}.tsv", | 2350 | + with open(new_ali_path, 'w') as o: |
2351 | + p1 = subprocess.run(["cmalign", "--ifile", path_to_seq_data + f"realigned/{rfam_acc}.ins", | ||
2352 | + "--sfile", path_to_seq_data + f"realigned/{rfam_acc}.tsv", | ||
2376 | path_to_seq_data + f"realigned/{rfam_acc}.cm", | 2353 | path_to_seq_data + f"realigned/{rfam_acc}.cm", |
2377 | - path_to_seq_data + f"realigned/{rfam_acc}++.fa"], | 2354 | + path_to_seq_data + f"realigned/{rfam_acc}_new.fa"], |
2378 | - stdout=subprocess.DEVNULL, stderr=subprocess.PIPE) | 2355 | + stdout=o, stderr=subprocess.PIPE) |
2379 | - stderr = p.stderr.decode("utf-8") | 2356 | + notify("Aligned new sequences together") |
2380 | - | 2357 | + |
2381 | - if len(stderr): | 2358 | + # Detect doublons and remove them |
2382 | - print('', flush=True) | 2359 | + existing_stk = AlignIO.read(existing_ali_path, "stockholm") |
2383 | - warn(f"Error during sequence alignment: {stderr}", error=True) | 2360 | + existing_ids = [r.id for r in existing_stk] |
2384 | - with open(runDir + "/errors.txt", "a") as er: | 2361 | + del existing_stk |
2385 | - er.write(f"Attempting to realign {rfam_acc}:\n" + stderr + '\n') | 2362 | + new_stk = AlignIO.read(new_ali_path, "stockholm") |
2386 | - return 1 | 2363 | + new_ids = [r.id for r in new_stk] |
2387 | - else: | 2364 | + del new_stk |
2388 | - print('\t'+validsymb, flush=True) | 2365 | + doublons = [i for i in existing_ids if i in new_ids] |
2366 | + del existing_ids, new_ids | ||
2367 | + if len(doublons): | ||
2368 | + warn(f"Removing {len(doublons)} doublons from existing {rfam_acc}++.stk and using their newest version") | ||
2369 | + with open(path_to_seq_data + "realigned/toremove.txt", "w") as toremove: | ||
2370 | + toremove.write('\n'.join(doublons)+'\n') | ||
2371 | + p = subprocess.run(["esl-alimanip", "--seq-r", path_to_seq_data + "realigned/toremove.txt", "-o", existing_ali_path+"2", existing_ali_path], | ||
2372 | + stdout=subprocess.DEVNULL, stderr=subprocess.PIPE) | ||
2373 | + p = subprocess.run(["mv", existing_ali_path+"2", existing_ali_path], | ||
2374 | + stdout=subprocess.DEVNULL, stderr=subprocess.PIPE) | ||
2375 | + os.remove(path_to_seq_data + "realigned/toremove.txt") | ||
2376 | + | ||
2377 | + # And we merge the two alignments | ||
2378 | + p2 = subprocess.run(["esl-alimerge", "-o", path_to_seq_data + f"realigned/{rfam_acc}_merged.stk", | ||
2379 | + "--rna", existing_ali_path, new_ali_path], | ||
2380 | + stdout=subprocess.DEVNULL, stderr=subprocess.PIPE) | ||
2381 | + stderr = p1.stderr.decode('utf-8') + p2.stderr.decode('utf-8') | ||
2382 | + subprocess.run(["mv", path_to_seq_data + f"realigned/{rfam_acc}_merged.stk", existing_ali_path]) | ||
2383 | + notify("Merged alignments into one") | ||
2384 | + | ||
2385 | + # remove the partial files | ||
2386 | + os.remove(new_ali_path) | ||
2387 | + os.remove(path_to_seq_data + f"realigned/{rfam_acc}_new.fa") | ||
2388 | + else: | ||
2389 | + # Alignment does not exist yet. We need to compute it from scratch. | ||
2390 | + print(f"\t> Aligning {rfam_acc} sequences together (cmalign) ...", end='', flush=True) | ||
2391 | + | ||
2392 | + cmd = ["cmalign"] | ||
2393 | + if alignopts is not None: | ||
2394 | + cmd += alignopts.split(" ") | ||
2395 | + cmd += ['-o', path_to_seq_data + f"realigned/{rfam_acc}++.stk", | ||
2396 | + "--ifile", path_to_seq_data + f"realigned/{rfam_acc}.ins", | ||
2397 | + "--sfile", path_to_seq_data + f"realigned/{rfam_acc}.tsv", | ||
2398 | + path_to_seq_data + f"realigned/{rfam_acc}.cm", | ||
2399 | + path_to_seq_data + f"realigned/{rfam_acc}++.fa"] | ||
2400 | + | ||
2401 | + p = subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE) | ||
2402 | + stderr = p.stderr.decode("utf-8") | ||
2403 | + | ||
2404 | + if len(stderr): | ||
2405 | + print('', flush=True) | ||
2406 | + warn(f"Error during sequence alignment: {stderr}", error=True) | ||
2407 | + with open(runDir + "/errors.txt", "a") as er: | ||
2408 | + er.write(f"Attempting to realign {rfam_acc}:\n" + stderr + '\n') | ||
2409 | + return 1 | ||
2410 | + else: | ||
2411 | + print('\t'+validsymb, flush=True) | ||
2389 | 2412 | ||
2390 | - # Convert Stockholm to aligned FASTA | 2413 | + # Convert Stockholm to aligned FASTA |
2391 | - subprocess.run(["esl-reformat", "-o", path_to_seq_data + f"realigned/{rfam_acc}++.afa", | 2414 | + subprocess.run(["esl-reformat", "-o", path_to_seq_data + f"realigned/{rfam_acc}++.afa", |
2392 | "--informat", "stockholm", | 2415 | "--informat", "stockholm", |
2393 | "afa", path_to_seq_data + f"realigned/{rfam_acc}++.stk"]) | 2416 | "afa", path_to_seq_data + f"realigned/{rfam_acc}++.stk"]) |
2394 | - subprocess.run(["rm", "-f", "esltmp*"]) # We can, because we are not running in parallel for this part. | 2417 | + subprocess.run(["rm", "-f", "esltmp*"]) # We can use a joker here, because we are not running in parallel for this part. |
2418 | + | ||
2419 | +@trace_unhandled_exceptions | ||
2420 | +def work_realign(useSina, alignopts, rfam_acc): | ||
2421 | + """ Runs multiple sequence alignements by RNA family. | ||
2395 | 2422 | ||
2423 | + It aligns the Rfam hits from a RNA family with the sequences from the list of chains. | ||
2424 | + Rfam covariance models are used with Infernal tools or SINA based on the options provided by the user. | ||
2425 | + Even if the user prefers to use SINA it will be used only for rRNA and cmalign for other families | ||
2426 | + """ | ||
2427 | + | ||
2428 | + setproctitle(f"RNAnet.py work_realign({rfam_acc})") | ||
2429 | + if useSina and rfam_acc in LSU_set | SSU_set: | ||
2430 | + use_sina(rfam_acc) | ||
2431 | + else: | ||
2432 | + use_infernal(rfam_acc, alignopts) | ||
2433 | + | ||
2396 | # Assert everything worked, or save an error | 2434 | # Assert everything worked, or save an error |
2397 | with open(path_to_seq_data + f"realigned/{rfam_acc}++.afa", 'r') as output: | 2435 | with open(path_to_seq_data + f"realigned/{rfam_acc}++.afa", 'r') as output: |
2398 | if not len(output.readline()): | 2436 | if not len(output.readline()): |
... | @@ -2402,6 +2440,73 @@ def work_realign(rfam_acc): | ... | @@ -2402,6 +2440,73 @@ def work_realign(rfam_acc): |
2402 | er.write(f"Failed to realign {rfam_acc} (killed)") | 2440 | er.write(f"Failed to realign {rfam_acc} (killed)") |
2403 | 2441 | ||
2404 | @trace_unhandled_exceptions | 2442 | @trace_unhandled_exceptions |
2443 | +def work_pydca(f, columns_to_save): | ||
2444 | + """ | ||
2445 | + This function writes an alignment file containing only the columns which will be saved to the database, | ||
2446 | + converted to uppercase, and without non-ACGU nucleotides. | ||
2447 | + This file in then used by pydca to compute DCA features, and finally removed. | ||
2448 | + """ | ||
2449 | + | ||
2450 | + align = AlignIO.read(path_to_seq_data + f"realigned/{f}++.afa", "fasta") | ||
2451 | + for s in align: | ||
2452 | + s.seq=s.seq.upper() # Convert to uppercase as needed for pydca | ||
2453 | + filtered_alignment = align[:, 1:1] # all the lines, but no columns | ||
2454 | + for p in columns_to_save: | ||
2455 | + filtered_alignment += align[:, p-1:p] # save columns one by one | ||
2456 | + | ||
2457 | + # Replace all other letters by a deletion gap just for the | ||
2458 | + # aim to use pydca as sites other than ACGU . and - are not accepted | ||
2459 | + for s in filtered_alignment: | ||
2460 | + for i in range(len(s.seq)): | ||
2461 | + if s.seq[i].upper() not in "ACGU-.": | ||
2462 | + s.seq[i]='-' | ||
2463 | + | ||
2464 | + # Create a fasta file to be used by pydca | ||
2465 | + with open(path_to_seq_data+f"/realigned/{f}_filtered_for_pydca.afa", "w") as only_3d: | ||
2466 | + try: | ||
2467 | + only_3d.write(format(filtered_alignment, "fasta")) | ||
2468 | + except ValueError as e: | ||
2469 | + warn(e) | ||
2470 | + | ||
2471 | + # PyDCA instance with options, | ||
2472 | + # Here lamda_J is set by pydca to 0.2*(L-1) where L is the length of the sequences | ||
2473 | + # The maximum number of iterations is set to 500 for gradient descent | ||
2474 | + # Lamda_h is set to 1 and seqid is set to 0.8 as suggested by pydca papers | ||
2475 | + # Reference: | ||
2476 | + # Zerihun MB, Pucci F, Peter EK, Schug A. pydca v1. 0: a comprehensive software for Direct Coupling Analysis of RNA and Protein Sequences. Bioinformatics. | ||
2477 | + # 2020;36(7):2264–2265. 10.1093/bioinformatics/btz892 - DOI - https://pubmed.ncbi.nlm.nih.gov/31778142/ | ||
2478 | + plmdca_inst = plmdca.PlmDCA(path_to_seq_data+f"/realigned/{f}_filtered_for_pydca.afa", | ||
2479 | + "rna", seqid = 0.8, lambda_h = 1.0, num_threads = 10, max_iterations = 500) | ||
2480 | + number_of_sites=len(columns_to_save)*(len(columns_to_save)-1)//2 # L*(L-1)/2 where L=len(columns_to_save) | ||
2481 | + | ||
2482 | + # Tuple of two list of tuples | ||
2483 | + # - the first list contains the fields of sites (nucleotides) | ||
2484 | + # - the second contains pairwise fields (2 nucleotides) | ||
2485 | + # linear distance is zero in order to keep all possible pairs | ||
2486 | + # because if linear dist=x>0 the pydca will return position |i-j|>x | ||
2487 | + # which will force us to lose a lot of pairs | ||
2488 | + params = plmdca_inst.compute_params(linear_dist=0, num_site_pairs=number_of_sites) | ||
2489 | + | ||
2490 | + # Fröbenius norm with average product correction | ||
2491 | + fn_apc = plmdca_inst.compute_sorted_FN_APC() | ||
2492 | + | ||
2493 | + # Save to file | ||
2494 | + np.savez(path_to_seq_data+f"/realigned/{f}_pydca.npz", PARAMS=params, FNAPC=fn_apc) | ||
2495 | + | ||
2496 | + # A dictionary to be used in the function where the frequencies are stored in align_column table | ||
2497 | + return_dict_fields={} | ||
2498 | + for list_fields in params[0]: | ||
2499 | + # The element at 0 is the index | ||
2500 | + # So taking the value from column to save at that index will give us | ||
2501 | + # the fields to be stored at align_column in the table | ||
2502 | + return_dict_fields[columns_to_save[list_fields[0]]] = list_fields[1] | ||
2503 | + | ||
2504 | + # Cleanup | ||
2505 | + subprocess.run(["rm", "-f", path_to_seq_data+f"/realigned/{f}_filtered_for_pydca.afa"]) | ||
2506 | + | ||
2507 | + return return_dict_fields | ||
2508 | + | ||
2509 | +@trace_unhandled_exceptions | ||
2405 | def work_pssm_remap(f): | 2510 | def work_pssm_remap(f): |
2406 | """Computes Position-Specific-Scoring-Matrices given the multiple sequence alignment of the RNA family. | 2511 | """Computes Position-Specific-Scoring-Matrices given the multiple sequence alignment of the RNA family. |
2407 | This also remaps the 3D object sequence with the aligned sequence in the MSA. | 2512 | This also remaps the 3D object sequence with the aligned sequence in the MSA. |
... | @@ -2477,7 +2582,6 @@ def work_pssm_remap(f): | ... | @@ -2477,7 +2582,6 @@ def work_pssm_remap(f): |
2477 | 2582 | ||
2478 | # At this point, pssm_info is a numpy array containing the PSSM and consensus a list of consensus chars. | 2583 | # At this point, pssm_info is a numpy array containing the PSSM and consensus a list of consensus chars. |
2479 | 2584 | ||
2480 | - | ||
2481 | ########################################################################################## | 2585 | ########################################################################################## |
2482 | # Remap sequences of the 3D chains with sequences in the alignment | 2586 | # Remap sequences of the 3D chains with sequences in the alignment |
2483 | ########################################################################################## | 2587 | ########################################################################################## |
... | @@ -2560,11 +2664,29 @@ def work_pssm_remap(f): | ... | @@ -2560,11 +2664,29 @@ def work_pssm_remap(f): |
2560 | # Get a sorted list from the set | 2664 | # Get a sorted list from the set |
2561 | columns = sorted(columns_to_save) | 2665 | columns = sorted(columns_to_save) |
2562 | 2666 | ||
2667 | + # Save the re_mappings | ||
2668 | + conn = sqlite3.connect(runDir + '/results/RNANet.db', timeout=20.0) | ||
2669 | + conn.execute('pragma journal_mode=wal') # Allow multiple other readers to ask things while we execute this writing query | ||
2670 | + sql_execute(conn, """INSERT INTO re_mapping (chain_id, index_chain, index_ali) | ||
2671 | + VALUES (?, ?, ?) | ||
2672 | + ON CONFLICT(chain_id, index_chain) DO UPDATE SET index_ali=excluded.index_ali;""", | ||
2673 | + many=True, data=re_mappings) | ||
2674 | + | ||
2675 | + # Delete alignment columns that are not used anymore from the database | ||
2676 | + current_family_columns = [ x[0] for x in sql_ask_database(conn, f"SELECT index_ali FROM align_column WHERE rfam_acc = '{f}';")] | ||
2677 | + unused = [] | ||
2678 | + for col in current_family_columns: | ||
2679 | + if col not in columns_to_save: | ||
2680 | + unused.append((f, col)) | ||
2681 | + sql_execute(conn, """DELETE FROM align_column WHERE rfam_acc = ? AND index_ali = ?;""", many=True, data=unused) | ||
2682 | + conn.commit() | ||
2683 | + | ||
2684 | + | ||
2563 | ########################################################################################## | 2685 | ########################################################################################## |
2564 | - # Save the alignment columns and their mappings to the database | 2686 | + # Retrieve or compute metadata relative to the MSA columns |
2565 | ########################################################################################## | 2687 | ########################################################################################## |
2566 | 2688 | ||
2567 | - setproctitle(f"RNAnet.py work_pssm_remap({f}) saving") | 2689 | + setproctitle(f"RNAnet.py work_pssm_remap({f}) insert/match states") |
2568 | 2690 | ||
2569 | # Get back the information of match/insertion states from the STK file | 2691 | # Get back the information of match/insertion states from the STK file |
2570 | alignstk = AlignIO.read(path_to_seq_data + "realigned/" + f + "++.stk", "stockholm") | 2692 | alignstk = AlignIO.read(path_to_seq_data + "realigned/" + f + "++.stk", "stockholm") |
... | @@ -2590,32 +2712,22 @@ def work_pssm_remap(f): | ... | @@ -2590,32 +2712,22 @@ def work_pssm_remap(f): |
2590 | cm_2d.append(".") | 2712 | cm_2d.append(".") |
2591 | cm_coord += 1 | 2713 | cm_coord += 1 |
2592 | 2714 | ||
2593 | - # Save the re_mappings | 2715 | + setproctitle(f"RNAnet.py work_pssm_remap({f}) Potts model, DCA") |
2594 | - conn = sqlite3.connect(runDir + '/results/RNANet.db', timeout=20.0) | ||
2595 | - conn.execute('pragma journal_mode=wal') # Allow multiple other readers to ask things while we execute this writing query | ||
2596 | - sql_execute(conn, """INSERT INTO re_mapping (chain_id, index_chain, index_ali) | ||
2597 | - VALUES (?, ?, ?) | ||
2598 | - ON CONFLICT(chain_id, index_chain) DO UPDATE SET index_ali=excluded.index_ali;""", | ||
2599 | - many=True, data=re_mappings) | ||
2600 | 2716 | ||
2601 | - # Delete alignment columns that are not used anymore from the database | 2717 | + rfam_fields_record = work_pydca(f, columns) |
2602 | - current_family_columns = [ x[0] for x in sql_ask_database(conn, f"SELECT index_ali FROM align_column WHERE rfam_acc = '{f}';")] | ||
2603 | - unused = [] | ||
2604 | - for col in current_family_columns: | ||
2605 | - if col not in columns_to_save: | ||
2606 | - unused.append((f, col)) | ||
2607 | - sql_execute(conn, """DELETE FROM align_column WHERE rfam_acc = ? AND index_ali = ?;""", many=True, data=unused) | ||
2608 | - conn.commit() | ||
2609 | 2718 | ||
2610 | - # Save the useful columns in the database | 2719 | + data = [(f, j, cm_coords[j-1]) + tuple(pssm_info[:,j-1]) + tuple(rfam_fields_record[j]) + (consensus[j-1], cm_2d[j-1]) for j in sorted(columns_to_save)] |
2611 | - data = [(f, j, cm_coords[j-1]) + tuple(pssm_info[:,j-1]) + (consensus[j-1], cm_2d[j-1]) for j in sorted(columns_to_save)] | 2720 | + sql_execute(conn, """INSERT INTO align_column (rfam_acc, index_ali, cm_coord, freq_A, freq_C, freq_G, freq_U, freq_other, fields_A, fields_C, fields_G, fields_U, gap_percent, consensus, cons_sec_struct) |
2612 | - sql_execute(conn, """INSERT INTO align_column (rfam_acc, index_ali, cm_coord, freq_A, freq_C, freq_G, freq_U, freq_other, gap_percent, consensus, cons_sec_struct) | 2721 | + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) ON CONFLICT(rfam_acc, index_ali) DO |
2613 | - VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) ON CONFLICT(rfam_acc, index_ali) DO | ||
2614 | UPDATE SET cm_coord=excluded.cm_coord, freq_A=excluded.freq_A, freq_C=excluded.freq_C, freq_G=excluded.freq_G, freq_U=excluded.freq_U, | 2722 | UPDATE SET cm_coord=excluded.cm_coord, freq_A=excluded.freq_A, freq_C=excluded.freq_C, freq_G=excluded.freq_G, freq_U=excluded.freq_U, |
2615 | - freq_other=excluded.freq_other, gap_percent=excluded.gap_percent, consensus=excluded.consensus, cons_sec_struct=excluded.cons_sec_struct;""", many=True, data=data) | 2723 | + freq_other=excluded.freq_other, fields_A=excluded.fields_A, fields_C=excluded.fields_C, fields_G=excluded.fields_G, fields_U=excluded.fields_U, |
2724 | + gap_percent=excluded.gap_percent, consensus=excluded.consensus, cons_sec_struct=excluded.cons_sec_struct;""", many=True, data=data) | ||
2616 | # Add an unknown values column, with index_ali 0 (for nucleotides unsolved in 3D giving a gap '-' but found facing letter in the alignment) | 2725 | # Add an unknown values column, with index_ali 0 (for nucleotides unsolved in 3D giving a gap '-' but found facing letter in the alignment) |
2617 | - sql_execute(conn, f"""INSERT OR IGNORE INTO align_column (rfam_acc, index_ali, cm_coord, freq_A, freq_C, freq_G, freq_U, freq_other, gap_percent, consensus, cons_sec_struct) | 2726 | + sql_execute(conn, f"""INSERT OR IGNORE INTO align_column (rfam_acc, index_ali, cm_coord, freq_A, freq_C, freq_G, freq_U, freq_other, |
2618 | - VALUES (?, 0, NULL, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, '-', "NULL");""", data=(f,)) | 2727 | + fields_A, fields_C, fields_G, fields_U, gap_percent, consensus, cons_sec_struct) |
2728 | + VALUES (?, 0, NULL, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, '-', NULL);""", data=(f,)) | ||
2729 | + | ||
2730 | + | ||
2619 | # Save the number of "used columns" to table family ( = the length of the alignment if it was composed only of the RNANet chains) | 2731 | # Save the number of "used columns" to table family ( = the length of the alignment if it was composed only of the RNANet chains) |
2620 | sql_execute(conn, f"UPDATE family SET ali_filtered_len = ? WHERE rfam_acc = ?;", data=(len(columns_to_save), f)) | 2732 | sql_execute(conn, f"UPDATE family SET ali_filtered_len = ? WHERE rfam_acc = ?;", data=(len(columns_to_save), f)) |
2621 | conn.close() | 2733 | conn.close() |
... | @@ -2659,9 +2771,10 @@ def work_save(c, homology=True): | ... | @@ -2659,9 +2771,10 @@ def work_save(c, homology=True): |
2659 | if homology: | 2771 | if homology: |
2660 | df = pd.read_sql_query(f""" | 2772 | df = pd.read_sql_query(f""" |
2661 | SELECT index_chain, old_nt_resnum, nt_position, nt_name, nt_code, nt_align_code, cm_coord, | 2773 | SELECT index_chain, old_nt_resnum, nt_position, nt_name, nt_code, nt_align_code, cm_coord, |
2662 | - is_A, is_C, is_G, is_U, is_other, freq_A, freq_C, freq_G, freq_U, freq_other, gap_percent, consensus, cons_sec_struct, dbn, | 2774 | + is_A, is_C, is_G, is_U, is_other, freq_A, freq_C, freq_G, freq_U, freq_other, fields_A, fields_C, fields_G, |
2663 | - paired, nb_interact, pair_type_LW, pair_type_DSSR, alpha, beta, gamma, delta, epsilon, zeta, epsilon_zeta, | 2775 | + fields_U, gap_percent, consensus, cons_sec_struct, dbn, paired, nb_interact, pair_type_LW, pair_type_DSSR, |
2664 | - chi, bb_type, glyco_bond, form, ssZp, Dp, eta, theta, eta_prime, theta_prime, eta_base, theta_base, | 2776 | + alpha, beta, gamma, delta, epsilon, zeta, epsilon_zeta, chi, bb_type, glyco_bond, form, ssZp, Dp, |
2777 | + eta, theta, eta_prime, theta_prime, eta_base, theta_base, | ||
2665 | v0, v1, v2, v3, v4, amplitude, phase_angle, puckering FROM | 2778 | v0, v1, v2, v3, v4, amplitude, phase_angle, puckering FROM |
2666 | (SELECT chain_id, rfam_acc from chain WHERE chain_id = {c.db_chain_id}) | 2779 | (SELECT chain_id, rfam_acc from chain WHERE chain_id = {c.db_chain_id}) |
2667 | NATURAL JOIN re_mapping | 2780 | NATURAL JOIN re_mapping | ... | ... |
... | @@ -952,8 +952,8 @@ def par_distance_matrix(filelist, f, label, consider_all_atoms, s): | ... | @@ -952,8 +952,8 @@ def par_distance_matrix(filelist, f, label, consider_all_atoms, s): |
952 | else: | 952 | else: |
953 | d[i,j] = get_euclidian_distance(coordinates_with_gaps[i], coordinates_with_gaps[j]) | 953 | d[i,j] = get_euclidian_distance(coordinates_with_gaps[i], coordinates_with_gaps[j]) |
954 | 954 | ||
955 | - if f not in LSU_set and f not in SSU_set: | 955 | + # if f not in LSU_set and f not in SSU_set: |
956 | - np.savetxt(runDir + '/results/distance_matrices/' + f + '_'+ label + '/'+ s.id.strip("\'") + '.csv', d, delimiter=",", fmt="%.3f") | 956 | + np.savetxt(runDir + '/results/distance_matrices/' + f + '_'+ label + '/'+ s.id.strip("\'") + '.csv', d, delimiter=",", fmt="%.3f") |
957 | return 1-np.isnan(d).astype(int), np.nan_to_num(d), np.nan_to_num(d*d) | 957 | return 1-np.isnan(d).astype(int), np.nan_to_num(d), np.nan_to_num(d*d) |
958 | 958 | ||
959 | @trace_unhandled_exceptions | 959 | @trace_unhandled_exceptions |
... | @@ -1169,7 +1169,7 @@ if __name__ == "__main__": | ... | @@ -1169,7 +1169,7 @@ if __name__ == "__main__": |
1169 | 1169 | ||
1170 | sys.exit() | 1170 | sys.exit() |
1171 | elif opt == '--version': | 1171 | elif opt == '--version': |
1172 | - print("RNANet statistics 1.3 beta") | 1172 | + print("RNANet statistics 1.4 beta") |
1173 | sys.exit() | 1173 | sys.exit() |
1174 | elif opt == "-r" or opt == "--resolution": | 1174 | elif opt == "-r" or opt == "--resolution": |
1175 | assert float(arg) > 0.0 and float(arg) <= 20.0 | 1175 | assert float(arg) > 0.0 and float(arg) <= 20.0 | ... | ... |
-
Please register or login to post a comment