statistical_potential.py 23.9 KB
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613
#!/usr/bin/python3

# RNANet statistics
# Developed by Aglaé Tabot, 2021 

# This file computes statistical potentials over the produced dataset.
# THIS FILE IS NOT SUPPOSED TO BE RUN DIRECTLY.

import getopt, os, pickle, sqlite3, shlex, subprocess, sys, warnings
import time
import numpy as np
import pandas as pd
import threading as th
import matplotlib
import matplotlib.pyplot as plt
import json
import Bio
from Bio.PDB.MMCIFParser import MMCIFParser
from Bio.PDB.vectors import Vector, calc_angle, calc_dihedral
from multiprocessing import Pool, Manager
from os import path
from tqdm import tqdm
from collections import Counter
from setproctitle import setproctitle
from RNAnet import Job, read_cpu_number, sql_ask_database, sql_execute, warn, notify, init_with_tqdm, trace_unhandled_exceptions
from sklearn.mixture import GaussianMixture
import scipy.stats as st
import warnings
from pandas.core.common import SettingWithCopyWarning
from itertools import combinations_with_replacement
from math import *
from geometric_stats import get_euclidian_distance, GMM_histo

np.set_printoptions(threshold=sys.maxsize, linewidth=np.inf, precision=8)

runDir = os.getcwd()

@trace_unhandled_exceptions
def pyle_measures_for_potentials(name, s, thr_idx):
    # measure distances P-P, P-C1', P-C4', C1'-C1', C4'-C4'
    # between residues along the chain
    # Requires a lot of storage space!
    if (path.isfile(runDir + '/results/geometry/Pyle/distances_i_i+1/distances_pyle_i_i+1'+name+'.csv')):
        return

    liste_dist=[]

    setproctitle(f"RNANet statistics.py Worker {thr_idx+1} pyle_measures({name})")

    chain = next(s[0].get_chains())
    
    for res1 in tqdm(chain, position=thr_idx+1, desc=f"Worker {thr_idx+1}: {name} pyle_measures", unit="res", leave=False):
       
        if res1.get_resname() in ["A", "C", "G", "U"]:
            resnum1=list(res1.get_id())[1]
            atom_p_1 = [ atom.get_coord() for atom in res1 if atom.get_name() ==  "P"]
            atom_c1p_1 = [ atom.get_coord() for atom in res1 if "C1'" in atom.get_fullname() ]
            atom_c4p_1 = [ atom.get_coord() for atom in res1 if "C4'" in atom.get_fullname() ]
            for res2 in chain:
                resnum2=list(res2.get_id())[1]
                if resnum2-resnum1 < 1 :
                    continue
                p_p=np.nan
                p_c4p=np.nan
                p_c1p=np.nan
                c4p_c4p=np.nan
                c1p_c1p=np.nan
                
                if res2.get_resname() in ["A", "C", "G", "U"]:
                    
                    atom_p_2 = [ atom.get_coord() for atom in res2 if atom.get_name() ==  "P"]
                    atom_c1p_2 = [ atom.get_coord() for atom in res2 if "C1'" in atom.get_fullname() ]
                    atom_c4p_2 = [ atom.get_coord() for atom in res2 if "C4'" in atom.get_fullname() ]

                    p_p = get_euclidian_distance(atom_p_1, atom_p_2)
                    p_c4p= get_euclidian_distance(atom_p_1, atom_c4p_2)
                    p_c1p= get_euclidian_distance(atom_p_1, atom_c1p_2)
                    c4p_c4p= get_euclidian_distance(atom_c4p_1, atom_c4p_2)
                    c1p_c1p= get_euclidian_distance(atom_c1p_1, atom_c1p_2)

                    liste_dist.append([res1.get_resname(), int(resnum1), res2.get_resname(), int(resnum2), p_p, p_c4p, p_c1p, c4p_c4p, c1p_c1p])

                
    df = pd.DataFrame(liste_dist, columns=["res1", "resnum1", "res2", "resnum2", "P-P", "P-C4'", "P-C1'", "C4'-C4'", "C1'-C1'"])
    df.to_csv(runDir + "/results/geometry/Pyle/distances_i_i+1/distances_pyle_i_i+1" + name + ".csv")

@trace_unhandled_exceptions
def gmm_pyle_type(ntpair, data, scan):

    setproctitle(f"GMM (Pyle {ntpair} )")
    
    os.makedirs(runDir + "/results/figures/GMM/Pyle/distances_i_i+1/", exist_ok=True)
    os.chdir(runDir + "/results/figures/GMM/Pyle/distances_i_i+1/")

    p_p=list(data["P-P"][~ np.isnan(data["P-P"])])
    p_c4p=list(data["P-C4'"][~ np.isnan(data["P-C4'"])])
    p_c1p=list(data["P-C1'"][~ np.isnan(data["P-C1'"])])
    c4p_c4p=list(data["C4'-C4'"][~ np.isnan(data["C4'-C4'"])])
    c1p_c1p=list(data["C1'-C1'"][~ np.isnan(data["C1'-C1'"])])
    
    GMM_histo(p_p, f"Distance P-P between {ntpair}", scan, toric=False, hist=False, col="cyan")
    GMM_histo(p_c4p, f"Distance P-C4' between {ntpair}", scan, toric=False, hist=False, col="tomato")
    GMM_histo(p_c1p, f"Distance P-C1' between {ntpair}", scan, toric=False, hist=False, col="goldenrod")
    GMM_histo(c4p_c4p, f"Distance C4'-C4' between {ntpair}", scan, toric=False, hist=False, col="magenta")
    GMM_histo(c1p_c1p, f"Distance C1'-C1' between {ntpair}", scan, toric=False, hist=False, col="black")
    
    plt.xlabel("Distance (Angströms)")
    
    plt.title(f"GMM of distances for {ntpair} ", fontsize=10)

    plt.savefig(runDir + "/results/figures/GMM/Pyle/distances_i_i+1/" + f"Distances_Pyle_{ntpair}.png" )
    plt.close()
    setproctitle(f"GMM (Pyle {ntpair} distances) finished")
@trace_unhandled_exceptions
def gmm_pyle_per_type(scan):

    setproctitle("GMM (Pyle model)")

    df = pd.read_csv(os.path.abspath(runDir + "/results/geometry/Pyle/distances_i_i+1/distances.csv"))

    data=df
    if len(data):
        for b1 in ['A','C','G','U']:
            for b2 in ['A','C','G','U']:
                thisbases = data[(data.res1 == b1)&(data.res2 == b2)]
                if len(thisbases):
                    gmm_pyle_type(b1+b2, thisbases, scan)
    setproctitle("GMM (Pyle model) finished")

# The next 7 functions are used to calculate the statistical potentials with the averaging method (residue-averaging)
# for the Pyle model from the GMM results, and to plot the corresponding figures


def pgaussred(x):
    """
    distribution function of the normal distribution (= probability that a random variable distributed according to is less than x)
    calculation by numerical integration: 2000 slices per standard deviation
    result rounded to 7 decimal places
    """
    if x==0:
        return 0.5
    u=abs(x)
    n=int(u*2000)
    du=u/n
    k=1/sqrt(2*pi)
    u1=0
    f1=k
    p=0.5
    for i in range(0,n):
        u2=u1+du
        f2=k*exp(-0.5*u2*u2)
        p=p+(f1+f2)*du*0.5
        u1=u2
        f1=f2
    if x<0:
        p = 1.0-p
    return round(p, 7)

def proba(m, s, xinf, xsup):
    """
    calculates the probability for a value to belong to the interval [xinf, xsup]
    for a normal distribution with mean m and standard deviation s
    """
    prob=pgaussred((xsup-m)/s)-pgaussred((xinf-m)/s)

    return prob

def extract_from_json(data, xinf, xsup):
    """
    extracts the means and standard deviations of the json obtained with the GMM calculations
    and calculates from these parameters the probability for a value to belong to the interval [xinf, xsup].
    """
    p=[]
    p_tot=0
    classes=[]
    with open(data + '.json') as json_data:
        data_dict = json.load(json_data)

    for i in range(len(data_dict['means'])):
        mean = data_dict['means'][i]
        if mean[0] == '[':
            mean=float((mean.split('['))[1].split(']')[0])
        else :
            mean=float(mean)
        std = data_dict['std'][i]
        if std[0] == '[':
            std=float((std.split('['))[2].split(']')[0])
        else:
            std=float(std)
        prob=proba(mean, std, xinf, xsup)
        p.append(prob)

    for x in p:
        p_tot = p_tot+x
    return p_tot

def averaging(liste_data, name):
    """
    creates a json that contains all the means and standard deviations of the parameters
    that we want to use to create the reference distribution with the averaging method
    """
    
    curves=[]
 
    x = np.linspace(0,250,1000)

    summary_data = {}
    summary_data["measures"] = []
    summary_data["means"] = []
    summary_data["std"] = []
    for data in  liste_data:
        s=0
        mean_tot=0
        std_tot=0
        with open(runDir + '/results/geometry/json/' + data + '.json') as json_data:
            data_dict = json.load(json_data)

        for i in range(len(data_dict['means'])):
            mean = float(data_dict['means'][i])
  
            std = float(data_dict['std'][i])
            
            weight = float(data_dict['weights'][i])
            y = weight*st.norm.pdf(x, mean, std)
            mean_tot=mean_tot+(weight*mean)
            std_tot=std_tot+(weight*std)
          
            curves.append(y)
        summary_data["measures"].append(data)
        summary_data["means"].append(str(mean_tot))
        summary_data["std"].append(str(std_tot))
 
    with open(runDir + '/results/statistical_potential/json/Pyle/avg_' +name + ".json", 'w', encoding='utf-8') as f:
        json.dump(summary_data, f, indent=4)

def potential_dist(data_obs, data_ref):
    name=data_obs.split('/')[-1]
    l=[]
    for k in range(0, 400, 5):
        obs=extract_from_json(data_obs, k, k+5)
        ref=extract_from_json(data_ref, k, k+5)
        if obs != 0.0 and ref != 0.0:
            u = -log(obs/ref)
            l.append([k, k+5, obs, ref, u])
            l.append([k+5, k+5, obs, ref, u])

        else:
            l.append([k, k+5, obs, ref, None])
            l.append([k+5, k+5, obs, ref, None])
    df=pd.DataFrame(l, columns=["Xinf", "Xsup", "Pobs", "Pref", "- ln(Pobs/Pref)"])
    df.to_csv(runDir + '/results/statistical_potential/ratio/Pyle/Statistical potential ' + name + ".csv")

def courbe_stat_pot(f):
    name=f.split('/')[-1]
    name=name.split('.')[0]
    df=pd.read_csv(f)
    E=list(df["- ln(Pobs/Pref)"][~ np.isnan(df["- ln(Pobs/Pref)"])])
    max_E=max(E)
    min_E=min(E)
    index_with_nan=df.index[df.iloc[:,5].isnull()]
    for i in index_with_nan:
        df.iloc[i, 5]=max_E # we replace the nan by the maximum found energy
     

    y=list(df["- ln(Pobs/Pref)"])
    x=list(df["Xinf"])

    x=np.array(x)
    y=np.array(y)
    plt.plot(x, y)
    plt.xlabel("Distance (Angström)")
    plt.ylabel("- ln(Pobs/Pref)")
    plt.title(name)
    plt.savefig(runDir + "/results/statistical_potential/figures/Pyle/avg_statistical_pot_ " + name + ".png")
    plt.close()

def stat_potential_pyle():
    pyle = ["P-P", "P-C1'", "P-C4'", "C1'-C1'", "C4'-C4'"]
    nt = ["A", "C", "G", "U"]
    for pair in pyle:
        type_list=[]
        for res1 in nt:
            for res2 in nt:
                setproctitle(f"RNANet statistics.py stat_potential_pyle({pair}, {res1}{res2})")
                type_list.append(f"Distance {pair} between {res1}{res2}")
                averaging(type_list, f"{pair}")
                for dist in type_list:
                    potential_dist(runDir + '/results/geometry/json/' + dist, runDir + f'/results/statistical_potential/json/Pyle/avg_{pair}')
                    courbe_stat_pot(runDir + '/results/statistical_potential/ratio/Pyle/Statistical potential ' + dist + '.csv')
                    setproctitle(f"RNANet statistics.py stat_potential_pyle({pair}, {res1}{res2}) finished")


@trace_unhandled_exceptions
def measures_heavy_atoms(name, s, thr_idx):
    """
    create a list of all possible pairs of atoms among the 85 types of atoms
    and of all classes of 5 Angstroms intervals between 0 and 150 (and >150)
    Calculate the distance between the different pairs 
    and count the number of occurrences in each distance class
    """

    if (path.isfile(runDir + '/results/geometry/all-atoms/distances_classes/distances_classes_occur_'+name+'.csv')):
        return

    #85 types of atoms
    A=['AP', 'AOP1', 'AOP2', "AO5'", "AC5'", "AC4'", "AO4'", "AC3'", "AO3'", "AC2'", "AO2'", "AC1'", 'AN9', 'AC8', 'AN7', 'AC5', 'AC6', 'AN6', 'AN1', 'AC2', 'AN3', 'AC4']
    C=['CP', 'COP1', 'COP2', "CO5'", "CC5'", "CC4'", "CO4'", "CC3'", "CO3'", "CC2'", "CO2'", "CC1'", 'CN1', 'CC2', 'CO2', 'CN3', 'CC4', 'CN4', 'CC5', 'CC6']
    G=['GP', 'GOP1', 'GOP2', "GO5'", "GC5'", "GC4'", "GO4'", "GC3'", "GO3'", "GC2'", "GO2'", "GC1'", 'GN9', 'GC8', 'GN7', 'GC5', 'GC6', 'GO6', 'GN1', 'GC2', 'GN2', 'GN3', 'GC4']
    U=['UP', 'UOP1', 'UOP2', "UO5'", "UC5'", "UC4'", "UO4'", "UC3'", "UO3'", "UC2'", "UO2'", "UC1'", 'UN1', 'UC2', 'UO2', 'UN3', 'UC4', 'UO4', 'UC5', 'UC6']
    
    setproctitle(f"RNANet statistics.py Worker {thr_idx+1} measures_heavy_atoms({name})")
    
    l=A+C+G+U
    comb=[]
    for i in combinations_with_replacement(l,2):
        # gives all the possible pairs (only once each) from the list of atoms -> 3655 pairs
        comb.append(i)

    list_comb=[]
    for x in comb:
        x=list(x)
        x=x[0]+'-'+x[1]
        list_comb.append(x)
  
    classes=[]
    # list for all classes of 5 Angstroms intervals between 0 and 150 (and >150)
    for i in range(0, 150, 5):
        classes.append([i, i+5])
    classes.append([150, 300])
    occurences=[]
    occurences=3655*[32*[0]] #3655 atom-pair types, 31 distance classes
    for i in range(len(occurences)):
        occurences[i]=[list_comb[i]]+31*[0] # 1 sublist per atom_pair type 

   
    chain = next(s[0].get_chains())
    coord_list=[]
    resseq=[]
    atom_types=[]
    for x in l:
        atom_types.append([x, 0])
    
    for res in chain:
        if res.get_resname() in ["A", "C", "G", "U"]:
            resseq.append(list(res.get_id())[1])
            # list of residue numbers
            for atom in res:
                coord_list.append([list(res.get_id())[1], res.get_resname(), atom.get_name(), atom.get_coord()])
                for i in range(len(atom_types)):
                    if res.get_resname()+atom.get_name()==atom_types[i][0]:
                        # count the number of atoms of each type
                        atom_types[i][1]=atom_types[i][1]+1
    
    nb_atom_types=pd.DataFrame(atom_types, columns=["atom", "count"])
    nb_atom_types.to_csv(runDir + '/results/geometry/all-atoms/atom_count/atom_count_'+name+'.csv')

    # creates a dataframe with the coordinates of the atoms, their names,
    # and the names and positions of the corresponding residues
    df=pd.DataFrame(coord_list, columns=["res_num", "res_name", "atom_name", "atom_coord"])

    # now, calculate the distances between all pairs of atoms along the chain
    for i in tqdm(range(1, max(df["res_num"])+1), position=thr_idx+1, desc=f"Worker {thr_idx+1}: {name} measures_heavy_atoms", unit="res", leave=False):
        if i in resseq:
            res1=df[df["res_num"]==i]
            res_name1=res1["res_name"][res1.index[0]]
            for atom1 in res1.index:
                coor1=res1["atom_coord"][atom1]
                atom_name1=res1["atom_name"][atom1]

                for j in range(i+1, max(df["res_num"])+1):
                    if j in resseq:
                        res2=df[df["res_num"]==j]
                        if res2.shape[0]==0:
                            return
                        res_name2=res2["res_name"][res2.index[0]]
                        for atom2 in res2.index:
                            coor2=res2["atom_coord"][atom2]
                            atom_name2=res2["atom_name"][atom2]
                            # calculation of the distance between the 2 atoms
                            dist=get_euclidian_distance(coor1, coor2)
                           
                            for x in range(len(classes)):
                                    # we seek to which class this distance belongs
                                    if classes[x][0]<= dist< classes[x][1] : 
                                        for k in occurences:
                                            if res_name1+atom_name1+'-'+res_name2+atom_name2 == k[0] or res_name2+atom_name2+'-'+res_name1+atom_name1 == k[0]:
                                                # add 1 to the box corresponding to the type of distance and class identified
                                                k[x+1]=k[x+1]+1
                                                
    classes_str=[str(c) for c in classes]
    # creation of the dataframe: types of pairs of atoms in rows and classes in columns
    df_occur = pd.DataFrame(occurences, columns=["atom_pair_type"] + classes_str)
    # save this
    df_occur.to_csv(runDir + "/results/geometry/all-atoms/distances_classes/distances_classes_occur_" + name + ".csv")
@trace_unhandled_exceptions
def count_occur_atom_dist(fpath, outfilename):
    """
    After having calculated the number of occurrences of the distances between pairs of atoms
    sorted by distance class and by type of pair for each RNA chain,
    we add these occurrences one by one to obtain a single dataframe with the total number of occurrences
    """
  
    setproctitle(f"Addition of occurences of {fpath}")
    liste=os.listdir(fpath)

    df_tot = pd.read_csv(os.path.abspath(fpath + liste.pop()))

    for f in range(len(liste)):
        df = pd.read_csv(os.path.abspath(fpath + liste.pop()))
        for i in range(df.shape[0]):
            for j in range(2, df.shape[1]):
                df_tot.iloc[i, j]=df_tot.iloc[i, j] + df.iloc[i, j]
        
    df_tot.to_csv(fpath+outfilename)

    setproctitle(f"Addition of occurences of {fpath} finished")

@trace_unhandled_exceptions
def mole_fraction(fpath, outfilename):
    """
    Calculation of the mole fraction of each type of atom within the set of structures
    """  
    setproctitle(f"Calculation of mole fractions of {fpath}")
    liste=os.listdir(fpath)
    
   
    df_tot = pd.read_csv(os.path.abspath(fpath + liste.pop()))
    del df_tot["Unnamed: 0"]

    for f in range(len(liste)):
        df = pd.read_csv(os.path.abspath(fpath + liste.pop()))
        del df["Unnamed: 0"]
        for i in range(df.shape[0]):
                df_tot.iloc[i, 1]=df_tot.iloc[i, 1] + df.iloc[i, 1]

    total=sum(list(df_tot["count"]))
    fract=[]
    for i in range(df_tot.shape[0]):
       
        fract.append(df_tot.iloc[i, 1]/total)
    df_tot["mole_fraction"]=fract
    # file_list=os.listdir(fpath)
    # file_list=[fpath+x for x in file_list]
    # for f in file_list: #after processing, deletion of csv by structure
    #     os.remove(f)
    df_tot.to_csv(fpath+outfilename) 

    setproctitle(f"Calculation of mole fractions of {fpath} finished")

@trace_unhandled_exceptions
def compute_ratio_from_csv(fpath, avg_file, qch_file):
    """
    Calculation of observed and reference probabilities 
    according to the methods chosen to establish the reference state
    Then calculation of the Pobs / Pref ratio
    """ 
    setproctitle("Compute ratio from csv")
    df = pd.read_csv(fpath)
    del df['Unnamed: 0']
    del df['Unnamed: 0.1']

    # calculation of the reference probability for each distance class (averaging method)
    # if method=averaging
    pref_avg_list=[]
    sommes=[]
    s_tot=0
    for j in range(1, df.shape[1]):
        s=0
        for i in range(df.shape[0]):
            s=s+df.iloc[i,j]
        sommes.append(s)
    s_tot=sum(sommes)
    for s in sommes:
        pref_avg_list.append(s/s_tot)
   
    
    df_bis=df.copy()
    # if method=averaging
    for i in range(df.shape[0]):
        for j in range(1, df.shape[1]):
            # calculation of the observed probability for each atom_pair type in each distance class
            df_bis.iloc[i,j]=df.iloc[i,j]/(sum(df.iloc[i, k] for k in range(1, df.shape[1])))
            # ratio between the observed probability and the reference probability
            df_bis.iloc[i,j]=df_bis.iloc[i,j]/pref_avg_list[j-1]
        

    # if method=quasi-chemical
    df_ter=df.copy()
    atom_count=pd.read_csv(runDir + '/results/geometry/all-atoms/atom_count/atom_count.csv')
    for i in range(df.shape[0]):
        x=[] # mole fractions
        for k in range(atom_count.shape[0]):
            # calculate the mole fractions of the atom corresponding to the pair in row i
            if atom_count.at[k, 'atom']==df.iloc[i, 0].split('-')[0] or atom_count.at[k, 'atom']==df.iloc[i, 0].split('-')[1]:
                x.append(atom_count.at[k, 'mole_fraction'])
                
        for j in range(1, df.shape[1]):
            if len(x)==2:
                df_ter.iloc[i, j]=df.iloc[i,j]/(x[0]*x[1]*sommes[j-1]) # ratio for qchA method (Nijobs(r)/xi*xj*Nobs(r))
            if len(x)==1:
                df_ter.iloc[i, j]=df.iloc[i,j]/(x[0]*x[0]*sommes[j-1]) 
    df_bis.to_csv(runDir + '/results/statistical_potential/ratio/all-atoms/' + avg_file)
    df_ter.to_csv(runDir + '/results/statistical_potential/ratio/all-atoms/' + qch_file)

    setproctitle("Compute ratio from csv finished")
    
@trace_unhandled_exceptions
def sql_new_table(conn):
    cur = conn.cursor()
    sql = """ CREATE TABLE IF NOT EXISTS all_atoms (
                pot_id          INTEGER PRIMARY KEY NOT NULL,
                atom_pair       VARCHAR(10) NOT NULL,
                distance_bin    VARCHAR(15),
                avg_ratio_pobs_pref REAL,
                qch_ratio_pobs_pref       REAL,
                UNIQUE (atom_pair, distance_bin)
            );
         """
    cur.execute(sql)
    # conn.executescript(
    #     """ CREATE TABLE IF NOT EXISTS stat_potential (
    #             pot_id INTEGER NOT NULL PRIMARY KEY,
    #             res1            CHAR(1),
    #             atom1           CHAR(4),
    #             res2            CHAR(1),
    #             atom2           CHAR(4),
    #             method          VARCHAR(20),
    #             distance_bin    VARCHAR(15),
    #             ratio_pobs_pref REAL,
    #             log_ratio       REAL
    #         );
    #      """)
    # CREATE TABLE IF NOT EXISTS pyle (
    #             pot_id          INTEGER PRIMARY KEY NOT NULL,
    #             atom_pair       VARCHAR(10) NOT NULL,
    #             distance_bin    VARCHAR(15),
    #             avg_ratio_pobs_pref REAL,
    #             UNIQUE (atom_pair, distance_bin)
    #         );
    conn.commit()

    #cur.close()

    conn.execute("pragma journal_mode=wal")
    #conn.close()
@trace_unhandled_exceptions
def save_into_database():
    setproctitle("Saving statistical potentials(avg, qch) into database")
    df_avg = pd.read_csv(runDir + '/results/statistical_potential/ratio/all-atoms/avg_ratio_pobs_pref.csv')
    df_qch = pd.read_csv(runDir + '/results/statistical_potential/ratio/all-atoms/qch_ratio_pobs_pref.csv')
    ratio_list=[]
    del df_avg['Unnamed: 0']
    del df_qch['Unnamed: 0']
    for i in range(df_avg.shape[0]):
        for j in range(1,df_avg.shape[1]):

            ratio_list.append([df_avg.iloc[i,0], df_avg.columns[j], df_avg.iloc[i, j], df_qch.iloc[i,j]])

    
    with sqlite3.connect(runDir + "/results/Stat_potential.db", timeout=20.0) as conn:
        conn.execute('pragma journal_mode=wal') # Allow multiple other readers to ask things while we execute this writing query
        # We use the REPLACE keyword to get the latest information
        sql_execute(conn, """INSERT OR REPLACE INTO all_atoms (atom_pair, distance_bin, avg_ratio_pobs_pref, qch_ratio_pobs_pref )
                                VALUES (?, ?, ?, ?);""", 
                    many=True, 
                    data=ratio_list
                    ) 
    setproctitle("Saving statistical potentials(avg, qch) into database finished") 
@trace_unhandled_exceptions
def stat_potential(f, method):
    df=pd.read_csv(f)
    del df['Unnamed: 0']
    df.set_index('atom_pair_type', inplace=True)
    df=df.T
    setproctitle(f"RNANet statistics.py stat_potential({method})")
    for pair in df.columns:
        c=df[pair].tolist()
        new=[]
        for x in c:
            if x!=0.0 and not np.isnan(x):
                new.append(-log(x))
                new.append(-log(x))
            if x==0.0:
                new.append(0.0)
                new.append(0.0)
            if np.isnan(x):
                new.append(0.0)
                new.append(0.0)
        abs=[]
        
        for i in range(0, 150, 5):
            abs.append(i)
            abs.append(i+5)
        abs.append(150)
        abs.append(300)

        x=abs
        y=new
        x=np.array(x)
        y=np.array(y)
        plt.plot(x, y)
        plt.xlabel('Distance')
        plt.ylabel("- ln(Pobs/Pref)")
        plt.title(f'Statistical potential of {pair} distance ({method} method)')
        plt.savefig(runDir + f"/results/statistical_potential/figures/all-atoms/{method}_method/{method}_statistical_pot_{pair}.png")
        plt.close()

    setproctitle(f"RNANet statistics.py stat_potential({method}) finished")



if __name__ == "__main__":
    print("This file is not supposed to be run directly. Run statistics.py instead.")