searchForMotifSites.py 10.1 KB
#!/usr/bin/python3

from sys import argv
import subprocess
import inspect
from multiprocessing import Pool, TimeoutError, cpu_count
from os import path, makedirs, getcwd, chdir, devnull

# Retrieve Jar3D Paths from file EditMe
jar3dexec = ""
HLmotifDir = ""
ILmotifDir = ""
exec(compile(open(path.dirname(path.abspath(inspect.getfile(inspect.currentframe())))+"/EditMe").read(),'','exec'))

class Loop:
    def __init__(self, header, subsequence, looptype, position):
        self.header = header
        self.seq = subsequence
        self.type = looptype
        self.position = position
    def get_header(self):
        return self.header
    def subsequence(self):
        return self.seq

class InsertionSite:
    def __init__(self, loop, csv_line):
        # BEWARE : jar3d csv output is crap because of java's locale settings. 
        # On french OSes, it uses commas to delimit the fields AND as floating point delimiters !!
        # Parse with caution, and check what the csv output files look like on your system...
        info = csv_line.split(',') 
        self.loop = loop                    # the Loop object that has been searched with jar3d
        self.position = loop.position       # position of the loop's components, so the motif's ones, in the query sequence.
        self.atlas_id = info[2]             # Motif model identifier of the RNA 3D Motif Atlas
        self.score = int(float(info[4]))    # alignment score of the subsequence to the motif model
        self.rotation = int(info[-2])      # should the motif model be inverted to fit the sequence ?
    def __lt__(self, other):
        return self.score < other.score
    def __gt__(self, other):
        return self.score > other.score
    
def enumerate_loops(s):
    def printLoops(s,loops):
        print(s)
        for l in loops:
            i = 0
            m=''
            for c in l:
                while i<c[0]:
                    m+=' '
                    i+=1
                while i<c[1]+1:
                    m+='-'
                    i+=1
            while i<len(s):
                m+=' '
                i+=1
            print(m,"\tfound in position",l)

    def resort(unclosedLoops):
        loops.insert(len(loops)-1-unclosedLoops, loops[-1])
        loops.pop(-1)

    opened = []
    openingStart = []
    closingStart = []
    loops = []
    loopsUnclosed = 0
    consecutiveOpenings = []
    if s[0]=='(': 
        consecutiveOpenings.append(1)
    consecutiveClosings = 0

    lastclosed = -1
    previous = ''
    for i in range(len(s)):

        # If we arrive on an unpaired segment
        if s[i] == '.':
            if previous == '(': 
                openingStart.append(i-1)
            if previous == ')': 
                closingStart.append(i-1)

        # Opening basepair
        if s[i] == '(':
            if previous == '(':
                consecutiveOpenings[-1] += 1
            else:
                consecutiveOpenings.append(1)
            if previous == ')':
                closingStart.append(i-1)
                
            if len(openingStart) and openingStart[-1] == opened[-1]: # We have something like (...(
                loops.append( [(openingStart[-1],i)] ) # Create a new loop starting with this component.
                openingStart.pop(-1)
                loopsUnclosed += 1
            if len(closingStart) and closingStart[-1] == lastclosed: # We have something like )...( or even )(
                loops[-1].append( (closingStart[-1],i) ) # Append a component to existing multiloop
                closingStart.pop(-1)

            opened.append(i)

        # Closing basepair
        if s[i] == ')':
            if previous == ')':
                consecutiveClosings += 1
            else:
                consecutiveClosings = 1
            if previous == '(': # This is not supposed to happen in real data, but whatever.
                openingStart.append(i-1)

            if len(openingStart) and openingStart[-1] == opened[-1]: # We have something like (...) or ()
                loops.append( [(openingStart[-1], i)] ) # Create a new loop, and save it as already closed (HL)
                openingStart.pop(-1)
                resort(loopsUnclosed)
            if len(closingStart) and closingStart[-1] == lastclosed: # We have something like )...)
                loops[-1].append( (closingStart[-1],i) ) # Append a component to existing multiloop and close it.
                closingStart.pop(-1)
                loopsUnclosed -= 1
                resort(loopsUnclosed)

            if i+1<len(s):
                if s[i+1] != ')': # We are on something like: ).
                    if consecutiveClosings < consecutiveOpenings[-1]: # an openingStart has not been correctly detected, like in ...((((((...)))...)))
                        loops.append( [(opened[-2], opened[-1])] ) # Create a new loop (uncompleted)
                        loopsUnclosed+=1

                    if consecutiveClosings == consecutiveOpenings[-1]: # We just completed an HL+stem, like ...(((...))).., we can forget its info
                        consecutiveClosings = 0
                        consecutiveOpenings.pop(-1)
                    else: # There are still several basepairs to remember, forget only the processed ones, keep the others
                        consecutiveOpenings[-1] -= consecutiveClosings 
                        consecutiveClosings = 0

                else: # We are on something like: ))
                    if consecutiveClosings == consecutiveOpenings[-1]: # we are on an closingStart that cannot be correctly detected, like in ...(((...(((...))))))
                        loops[-1].append( (i, i+1) ) # Append a component to the uncomplete loop and close it.
                        loopsUnclosed -= 1
                        resort(loopsUnclosed)
                        consecutiveClosings = 0  # Forget the info about the processed stem.
                        consecutiveOpenings.pop(-1)

            opened.pop(-1)
            lastclosed = i

        previous = s[i]
        # print(i,"=",s[i],"\t", "consec. Op=", consecutiveOpenings,"Cl=",consecutiveClosings)

    printLoops(s, loops)
    return(loops)

def launchJar3d(loop):
    # write motif to a file
    newpath = getcwd()+'/'+loop.header[1:] 
    if not path.exists(newpath):
        makedirs(newpath)
    chdir(newpath)
    filename = loop.header[1:]+".fasta"
    fasta = open(filename,'w')
    fasta.write(loop.get_header()+'\n'+loop.subsequence()+'\n')
    fasta.close()

    # Launch Jar3D on it
    if loop.type == 'h':
        cmd = ["java", "-jar", jar3dexec, filename, HLmotifDir+"/all.txt", loop.header[1:]+".HLloop.csv", loop.header[1:]+".HLseq.csv"]
    else:
        cmd = ["java", "-jar", jar3dexec, filename, ILmotifDir+"/all.txt", loop.header[1:]+".ILloop.csv", loop.header[1:]+".ILseq.csv"]
    print(' '.join(cmd))
    nowhere = open(devnull, 'w')
    subprocess.call(cmd, stdout=nowhere)
    nowhere.close()

    # Retrieve results
    insertion_sites = []
    if loop.type == 'h':
        capstype = "HL"
    else:
        capstype = "IL"
    csv = open(loop.header[1:]+".%sseq.csv"%capstype, 'r')
    l = csv.readline()
    while l:
        if "true" in l:
            insertion_sites.append( InsertionSite(loop, l) )
        l = csv.readline()
    csv.close()

    # Cleaning
    chdir("..")
    subprocess.call(["rm","-r",loop.header[1:]])
    return insertion_sites

filename = argv[1]
basename = filename[0:filename.index('.')]

# Retrieving possible 2D structrures from RNAsubopt
print("Retrieving possible 2D structures from RNAsubopt...")
dbn = open(basename+".dbn","w")
subprocess.call(["RNAsubopt", "-i", filename], stdout=dbn)
dbn.close()
dbn = open(basename+".dbn","r")
dbn.readline()
s = dbn.readline().split(' ')[0]
structures = []
l = dbn.readline()
while l:
    structures.append(l.split(' ')[0])
    l = dbn.readline()
dbn.close()
subprocess.call(["rm", basename+".dbn"])
for ss in structures:
    print(ss)
print()


# Extracting probable loops from these structures
print("Extracting probable loops from these structures...")
HLs = []
ILs = []
for ss in structures:
    loop_candidates = enumerate_loops(ss)
    for loop_candidate in loop_candidates:
        if len(loop_candidate) == 1 and loop_candidate not in HLs:
            HLs.append(loop_candidate)
        if len(loop_candidate) == 2 and loop_candidate not in ILs:
            ILs.append(loop_candidate)
print("TOTAL:")
print(len(HLs),"probable hairpin loops found")
print(len(ILs),"probable internal loops")
print()

# Retrieve subsequences corresponding to the possible loops
print("Retrieving subsequences corresponding to the possible loops...")
loops = []
for i,l in enumerate(HLs):
    loops.append(Loop(">HL%d"%(i+1), s[l[0][0]-1:l[0][1]], "h", l))
    print()
    print(loops[-1].get_header(),"\t\t",l)
    print(loops[-1].subsequence())
for i,l in enumerate(ILs):
    loops.append(Loop(">IL%d"%(i+1), s[l[0][0]-1:l[0][1]]+'*'+s[l[1][0]-1:l[1][1]], "i", l))
    print()    
    print(">IL",i+1,"\t\t",l)
    print(loops[-1].subsequence())
print()

# Scanning loop subsequences against motif database
print("Scanning loop subsequences against motif database (using %d threads)..." % (cpu_count()-1))
pool = Pool(processes=cpu_count()-1)
insertion_sites = [ x for y in pool.map(launchJar3d, loops) for x in y ]
insertion_sites.sort(reverse=True)
print(len(insertion_sites), "insertions found:")

# Writing results to file
c = 0
resultsfile = open(basename+".sites.csv","w")
resultsfile.write("Motif,Rotation,Score,Start1,End1,Start2,End2\n")
for site in insertion_sites:
    if site.score > 10:
        c+=1 
        string = "FOUND with score %d:\t\t possible insertion of motif "% site.score + site.atlas_id
        if site.rotation:
            string += " (reversed)"
        string += " at positions"
        print(string, site.loop.subsequence(), '*'.join([str(x) for x in site.position]))
    resultsfile.write(site.atlas_id+','+str(bool(site.rotation))+",%d"%site.score+',')
    positions = [','.join([str(y) for y in x]) for x in site.position]
    if len(positions)==1:
        positions.append("-,-")
    resultsfile.write(','.join(positions)+'\n')
if c< len(insertion_sites):
    print("... and %d more with score(s) below 10."% (len(insertion_sites)-c))   
resultsfile.close()