#!/usr/bin/python3
# Install_CaRNAval_RINs.py
# This script extracts the CaRNAval RINs from a Python pickle file that stores
# them as instances of the class defined in CaRNAval's RIN.py, and writes each
# RIN to its own flat text file.
# We use the pickle because the official JSON file is hard to understand, and
# Antoine Soulé recommended the pickle.
import importlib
import os
import pickle
import shutil
import subprocess
import sys

import networkx
def download_dataset(rin_dir, filename):
    """Download the CaRNAval archive and install its useful files in rin_dir.

    Only the pickled dataset and RIN.py are extracted. They are moved into
    rin_dir one by one, so a pre-existing rin_dir is reused instead of having
    to be deleted first.

    Args:
        rin_dir: destination directory (created if missing).
        filename: name of the pickled dataset inside the archive.

    Raises:
        OSError: wget/unzip is not installed, or a file operation failed.
        subprocess.CalledProcessError: wget or unzip exited with an error.
    """
    archive = "../data/modules/carnaval_dataset.zip"
    wanted = [filename, "RIN.py"]

    # Also guarantees that the parent directory of the archive exists.
    os.makedirs(rin_dir, exist_ok=True)
    subprocess.run(["wget", "-O", archive, "http://carnaval.lri.fr/carnaval_dataset.zip"],
                   check=True)
    # unzip extracts into ./carnaval_dataset/ (relative to the working directory).
    subprocess.run(["unzip", "-ou", archive] + ["carnaval_dataset/" + name for name in wanted],
                   check=True)
    os.remove(archive)
    for name in wanted:
        shutil.move(os.path.join("carnaval_dataset", name), os.path.join(rin_dir, name))
    try:
        # Best effort: drop the extraction directory, which should now be empty.
        os.rmdir("carnaval_dataset")
    except OSError:
        pass


def load_pickled_objects(path):
    """Return the list of all objects pickled back to back in the file at path.

    Raises:
        OSError: the file cannot be opened.
    """
    # NOTE: pickle.load() can execute arbitrary code, and the dataset is
    # fetched over plain HTTP. Only run this on a file you trust.
    objects = []
    with open(path, "rb") as stream:
        while True:
            try:
                objects.append(pickle.load(stream))
            except EOFError:
                break
    return objects


def split_components(nodes):
    """Group sorted node identifiers into runs of consecutive integers.

    Args:
        nodes: integer node identifiers, sorted in increasing order.

    Returns:
        A list of components, each one a list of identifiers that follow each
        other without a gap. An empty input yields an empty list.
    """
    components = []
    for node in nodes:
        if components and components[-1][-1] + 1 == node:
            components[-1].append(node)
        else:
            # Gap with the previous node (or very first node): new component.
            components.append([node])
    return components


def format_rin(motif):
    """Serialise one RIN graph to the flat text format stored in Subfiles/.

    Layout of the returned text:
        ntA,ntB,long_range;...
        one "a,b,long_range;" entry per edge labelled 'CWW', on a single line
        pos;k;seq
        one "first,last;length;sequence" line per component

    Nucleotides are renumbered from 0, following the sorted order of the nodes.

    Args:
        motif: graph offering the networkx interface used here: iteration over
            nodes, edges(), and the edges[e] / nodes[n] attribute dictionaries.

    Returns:
        The content of the text file, as one string.
    """
    nodes = sorted(motif)
    # Position of every node in sorted order (avoids a list.index() per edge).
    rank = {node: position for position, node in enumerate(nodes)}

    pairs = []
    for edge in motif.edges():
        attributes = motif.edges[edge]
        if attributes['label'] != 'CWW':
            continue
        nt_a = rank[edge[0]]
        nt_b = rank[edge[1]]
        # Keep a single orientation when an edge is stored in both directions.
        if nt_a <= nt_b:
            pairs.append("{},{},{};".format(nt_a, nt_b, attributes['long_range']))

    lines = ["ntA,ntB,long_range;...\n", "".join(pairs) + "\n", "pos;k;seq\n"]

    first = 0
    for component in split_components(nodes):
        letters = []
        for node in component:
            attributes = motif.nodes[node]
            # Sometimes in the nxpickled file, a node has the attribute
            # "realnt", and sometimes "real_nt", but it's the same thing.
            try:
                letters.append(attributes["realnt"])
            except KeyError:
                letters.append(attributes["real_nt"])
        last = first + len(component) - 1
        lines.append("{},{};{};{}\n".format(first, last, len(component), "".join(letters)))
        first = last + 1

    return "".join(lines)


def main():
    """Install the CaRNAval dataset if needed, then write one text file per RIN."""
    rin_dir = os.getcwd() + "/../data/modules/RIN/"
    filename = "CaRNAval_1_as_dictionnary.nxpickled"

    # Check that we can find CaRNAval RINs. The RIN module is not used by name,
    # but it must be importable for pickle to rebuild the stored objects.
    sys.path.append(os.path.abspath(rin_dir))
    try:
        import RIN  # noqa: F401
    except ImportError:
        # We have to download it
        try:
            download_dataset(rin_dir, filename)
        except (OSError, subprocess.CalledProcessError) as error:
            print("Download of the CaRNAval dataset failed : " + str(error))
            sys.exit(1)
        # The import system caches what it found (or did not find) in rin_dir
        # during the failed attempt: refresh it before trying again.
        importlib.invalidate_caches()
        sys.path_importer_cache.pop(os.path.abspath(rin_dir), None)
        import RIN  # noqa: F401

    try:
        objects = load_pickled_objects(rin_dir + filename)
    except OSError:
        print("File not found : " + rin_dir + filename)
        sys.exit(1)
    print("Dataset loaded")

    if not objects:
        print("No object found in " + rin_dir + filename)
        sys.exit(1)

    # Creation of a directory to extract RINs from the pickle file to individual files
    try:
        os.makedirs(rin_dir + "Subfiles", exist_ok=True)
    except OSError:
        print("Creation of the directory %s failed" % (rin_dir + "Subfiles"))
        sys.exit(1)

    # Loop on every CaRNAval module and extract it from the Python object to flat text file.
    # The indexing assumes the first pickled object maps keys 1..n to the RINs;
    # output files are numbered from 0.
    rins = objects[0]
    n_modules = len(rins)
    for i in range(1, 1 + n_modules):
        # Build the whole content first, so a failure never leaves a partial file.
        content = format_rin(rins[i].graph)
        with open(rin_dir + "Subfiles/" + str(i - 1) + ".txt", "w") as output:
            output.write(content)

    print("Successfully parsed "+filename, ", now individual RINs are saved in Subfiles/ folder.", sep='')


if __name__ == "__main__":
    main()