Aglaé TABOT

statistical potentials

# Before executing this script, you have to download the RNA-puzzles 3D structure files.
# This script converts the RNA-puzzles 3D structure files from PDB format to mmCIF format.
# After that, the geometric measurements can be performed on these files by changing the file paths in geometric_stats.py.
import Bio
from Bio.PDB import PDBParser
from Bio.PDB import MMCIFParser
import os
from os import path
import warnings
import numpy as np
import pandas as pd
from Bio.PDB.vectors import Vector, calc_angle, calc_dihedral
from statistics import get_euclidian_distance, get_flat_angle, get_torsion_angle, pos_b1, pos_b2
from multiprocessing import Pool
# Directory the script is launched from; every path below is built from it.
runDir = os.getcwd()

# Make sure the results_decoy/geometry/HiRE-RNA output folders exist.
for _geometry_dir in (
    "/results_decoy/geometry/HiRE-RNA/distances/",
    "/results_decoy/geometry/HiRE-RNA/angles/",
    "/results_decoy/geometry/HiRE-RNA/torsions/",
):
    os.makedirs(runDir + _geometry_dir, exist_ok=True)
def split_res_name(f):
    """Convert one RNA-puzzles PDB file to mmCIF with whitespace-free residue names.

    The structure is read from
    ``<runDir>/RNA-puzzles/raw_dataset_and_for_assessment/rna_predict/<f>``.
    Its first model is rebuilt chain by chain, each residue being recreated
    with its name stripped of surrounding spaces, and the result is saved as
    ``<runDir>/RNA-puzzles/rna_predict/<name>.cif`` where ``<name>`` is the
    part of ``f`` located before its first dot.

    Args:
        f: File name (not a path) of the PDB file to convert.

    Returns:
        None. Nothing is done when the target .cif file already exists.
    """
    name = f.split('.')[0]
    out_dir = runDir + "/RNA-puzzles/rna_predict/"
    cif_path = out_dir + name + ".cif"
    if path.isfile(cif_path):
        return

    with warnings.catch_warnings():
        # Ignore the PDB problems. This mostly warns that some chain is discontinuous.
        warnings.simplefilter('ignore', Bio.PDB.PDBExceptions.PDBConstructionWarning)
        warnings.simplefilter('ignore', Bio.PDB.PDBExceptions.BiopythonWarning)

        parser = PDBParser()
        s = parser.get_structure(name, os.path.abspath(runDir + "/RNA-puzzles/raw_dataset_and_for_assessment/rna_predict/" + f))

        new_s = Bio.PDB.Structure.Structure(s.get_id())
        model = s[0]  # only the first model is converted
        new_model = Bio.PDB.Model.Model(model.get_id())
        for chain in model:
            new_chain = Bio.PDB.Chain.Chain(chain.get_id())
            for res in chain:
                # strip() rather than split(' ')[1]: the latter raises an
                # IndexError when the name contains no space and returns an
                # empty string when the name starts with two spaces.
                res_name = res.get_resname().strip()
                new_residu = Bio.PDB.Residue.Residue(res.get_id(), res_name, res.get_segid())
                for atom in res.get_atoms():
                    new_residu.add(atom.copy())
                new_chain.add(new_residu)
            new_model.add(new_chain)
        new_s.add(new_model)

        # The destination folder is not created anywhere else in this script.
        # exist_ok=True keeps this safe when several workers reach it together.
        os.makedirs(out_dir, exist_ok=True)

        ioobj = Bio.PDB.MMCIFIO()
        ioobj.set_structure(new_s)
        ioobj.save(os.path.abspath(cif_path))
# Names of the raw PDB files to convert, listed once when the module is loaded.
liste = os.listdir(runDir + '/RNA-puzzles/raw_dataset_and_for_assessment/rna_predict')


def main():
    """Run split_res_name on every listed file, using a pool of 50 worker processes."""
    with Pool(processes=50) as pool:
        # The list is passed as is (no copy needed); the values returned by
        # the workers are not used, so they are not kept.
        pool.map(split_res_name, liste)


if __name__ == '__main__':
    main()
# This script is used to run DSSR on the 3D structures of RNA-puzzles,
# and then to extract the values of torsion angles and pseudotorsions
import json
import os
import subprocess
import warnings
import pandas as pd
from multiprocessing import Pool
# Directory the script is launched from; every path below is built from it.
runDir = os.getcwd()

# Make sure the RNA-puzzles/annotations/ folder exists.
_annotations_dir = runDir + "/RNA-puzzles/annotations/"
os.makedirs(_annotations_dir, exist_ok=True)
def annotate(fichier):
    """Run DSSR on one .cif file and save its standard output as a JSON annotation.

    The annotation is written to
    ``<runDir>/RNA-puzzles/annotations/<puz_id>.json``, where ``<puz_id>`` is
    the part of the file name located before its first dot.

    Args:
        fichier: Path of the structure file, using '/' as separator. Only
            files whose second dot-separated name component is 'cif' are
            processed.

    Returns:
        1 when DSSR reports an exception on stderr, None otherwise (file
        skipped because it is not a .cif or is already annotated, or
        annotation written).
    """
    name = fichier.split('/')[-1]
    parts = name.split('.')
    puz_id = parts[0]

    # Checking the number of components first avoids an IndexError on names
    # that contain no dot at all.
    if len(parts) < 2 or parts[1] != 'cif':
        return

    json_path = runDir + "/RNA-puzzles/annotations/" + puz_id + ".json"
    if os.path.isfile(json_path):
        return

    # run DSSR (you need to have it in your $PATH, follow x3dna installation instructions)
    # NOTE(review): with -o=<puz_id>.json DSSR presumably writes its JSON to that
    # file in the current working directory - confirm that stdout still carries
    # the analysis that is saved below.
    output = subprocess.run(["x3dna-dssr", f"-i={fichier}", "--json", "--auxfile=no", f"-o={puz_id}.json"], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout = output.stdout.decode('utf-8')
    stderr = output.stderr.decode('utf-8')

    if "exception" in stderr:
        # DSSR is unable to parse the chain.
        # warnings.warn is used because no warn() helper is defined or imported here.
        warnings.warn(f"Exception while running DSSR, ignoring {puz_id}.")
        return 1

    # save the analysis to file
    with open(json_path, "w") as json_file:
        json_file.write(stdout)
# Run annotate() on every entry of <runDir>/RNA-puzzles/rna_predict.
_predict_dir = runDir + "/RNA-puzzles/rna_predict"
for _entry in os.listdir(_predict_dir):
    annotate(_predict_dir + "/" + _entry)

# Make sure the RNA-puzzles/torsions/ and RNA-puzzles/pseudotorsions/ folders exist.
for _table_dir in ("/RNA-puzzles/torsions/", "/RNA-puzzles/pseudotorsions/"):
    os.makedirs(runDir + _table_dir, exist_ok=True)
def extract_3d_data(f):
    """Export the torsion and pseudotorsion columns of one DSSR JSON file to CSV.

    The JSON file ``<runDir>/RNA-puzzles/rna_predict/<pdb_id>.json`` is loaded,
    its "nts" entries are turned into a DataFrame, and two CSV files are
    written under ``<runDir>/RNA-puzzles/torsions/`` and
    ``<runDir>/RNA-puzzles/pseudotorsions/``. ``<pdb_id>`` is the part of ``f``
    located before its first dot.

    Args:
        f: File name (not a path). Only names whose second dot-separated
            component is 'json' are processed.

    Returns:
        None in every case, including when the file is skipped or when its
        content cannot be decoded as JSON.

    Raises:
        KeyError: If the JSON has no "nts" entry or lacks one of the expected
            columns.
    """
    parts = f.split('.')
    # Checking the number of components first avoids an IndexError on names
    # that contain no dot at all.
    if len(parts) < 2 or parts[1] != 'json':
        return
    pdb_id = parts[0]

    # NOTE(review): the JSON is read from rna_predict/ whereas annotate() saves
    # into annotations/ - confirm which folder is the intended input.
    try:
        with open(runDir + "/RNA-puzzles/rna_predict/" + pdb_id + ".json", 'r') as json_file:
            json_object = json.load(json_file)
    except json.decoder.JSONDecodeError:
        # Files that cannot be decoded are skipped silently.
        return None

    # Create the Pandas DataFrame for the nucleotides
    nts = json_object["nts"]  # sub-json-object
    df = pd.DataFrame(nts)

    # Selecting the full column list makes a missing DSSR field fail loudly.
    cols_we_keep = ["index_chain", "nt_resnum", "nt_name", "nt_code", "nt_id", "dbn", "alpha", "beta", "gamma", "delta", "epsilon", "zeta", "epsilon_zeta", "bb_type", "chi", "glyco_bond", "form", "ssZp", "Dp", "eta", "theta", "eta_prime", "theta_prime", "eta_base", "theta_base", "v0", "v1", "v2", "v3", "v4", "amplitude", "phase_angle", "puckering"]
    df = df[cols_we_keep]

    torsions = df[["index_chain", "alpha", "beta", "gamma", "delta", "epsilon", "zeta", "chi"]]
    pseudotorsions = df[["eta", "theta", "eta_prime", "theta_prime", "eta_base", "theta_base"]]

    # Written under runDir (not a hard-coded home directory) so that the files
    # land in the folders this script creates. File names are kept unchanged.
    torsions.to_csv(runDir + '/RNA-puzzles/torsions/torsions ' + pdb_id + '.csv')
    pseudotorsions.to_csv(runDir + '/RNA-puzzles/pseudotorsions/pseudotorsions ' + pdb_id + '.csv')
# Entries of <runDir>/RNA-puzzles/rna_predict/, listed once when the module is loaded.
l = os.listdir(runDir + "/RNA-puzzles/rna_predict/")


def main():
    """Run extract_3d_data on every listed entry, using a pool of 30 worker processes."""
    with Pool(processes=30) as pool:
        # The list is passed as is (no copy needed); the values returned by
        # the workers are not used, so they are not kept.
        pool.map(extract_3d_data, l)


if __name__ == '__main__':
    main()
This diff is collapsed. Click to expand it.