-
Notifications
You must be signed in to change notification settings - Fork 7
Expand file tree
/
Copy pathread_get_gene_seq.py
More file actions
132 lines (112 loc) · 4.47 KB
/
read_get_gene_seq.py
File metadata and controls
132 lines (112 loc) · 4.47 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
import json
from Bio import SeqIO
import os
import gzip
import progressbar
def read_from_multiple_lsy(lsyfl):
lsy = {}
for f in lsyfl:
d = {}
with open(f, "r") as file:
d = dict(json.load(file))
for t in d:
lsy[t] = d[t]
return lsy
# this function updates the given dictionary with the given keys and
# values list
def create_dict(keys, values, dictionary):
for i in range(len(keys)):
if keys[i] not in dictionary:
dictionary[keys[i]] = values[i]
return dictionary
# this function maps all the genes to their respective species.
# (Function: when finding the species of any gene we do not need to search the entire dataframe)
def group_seq_by_species(df, g_to_sp):
sph = list(df.homology_species)
ghsp = list(df.homology_gene_stable_id)
sp = list(df.species)
gsp = list(df.gene_stable_id)
create_dict(gsp, sp, g_to_sp)
create_dict(ghsp, sph, g_to_sp)
return g_to_sp
# this function returns the gene-id and gene-biotype from the description
# in the fasta file record.
def description_cleaner(description):
description = description.split()
t = ""
gbt = ""
for x in description:
try:
x = x.split(":")
if x[0] == "gene":
t = x[1].split(".")[0]
if x[0] == "gene_biotype":
gbt = x[1]
except BaseException:
return "aa", "aa"
return t, gbt
def read_gene_seq(dirname, s, genes_by_species):
lof = os.listdir(dirname) # list all the files in the sequences directory
ftr = []
for f in lof:
if f.split(".")[
0] in s: # check whether the species is present in the species to read list. Will skip those species which are not present in the dataframe
ftr.append(f)
data = {}
for f in progressbar.progressbar(ftr):
species = f.split(".")[0].lower()
with gzip.open(dirname + "/" + f, "rt") as file:
record = SeqIO.parse(file, "fasta")
for r in record:
gid, gbt = description_cleaner(r.description)
if str(gid) not in data and str(
gid) in genes_by_species[species] and gbt == "protein_coding":
data[gid] = str(r.seq)
return data
def read_gene_sequences(hdf, lsy, data_dir, fname):
"""The basic idea here is to create a list/dictionary of all the genes by their species.
Once the mapping is done, all the respective fasta sequence files are read by Species
and the CDNA sequences for each gene in the species record are read and stored.
Thus we don't have to read the same file multiple times."""
grouped_genes = {}
gene_by_species_dict = {}
for df in progressbar.progressbar(hdf):
grouped_genes = group_seq_by_species(df, grouped_genes)
for i in df.homology_species.unique():
gene_by_species_dict[i] = []
for i in df.species.unique():
gene_by_species_dict[i] = []
for x in progressbar.progressbar(lsy):
try:
species = grouped_genes[x] # get the species
except BaseException:
continue
# check if the gene already exists in the species dict or not.
if x not in gene_by_species_dict[species]:
gene_by_species_dict[species].append(x)
xl = lsy[x]['b']
xr = lsy[x]['f']
for gxl in xl:
if gxl == "NULL_GENE":
break
if gxl not in gene_by_species_dict[species]:
gene_by_species_dict[species].append(gxl)
for gxr in xr:
if gxr == "NULL_GENE":
break
if gxr not in gene_by_species_dict[species]:
gene_by_species_dict[species].append(gxr)
# select those species only whose gene sequences we have to read.
s = [x for x in gene_by_species_dict if len(gene_by_species_dict[x]) != 0]
s = [x.capitalize() for x in s]
data = read_gene_seq(data_dir, s, gene_by_species_dict)
not_found = {}
for species in gene_by_species_dict:
for gene in gene_by_species_dict[species]:
try:
_ = data[gene]
except BaseException:
not_found[gene] = 1
with open("processed/not_found_" + fname + ".json", "w") as file:
json.dump(not_found, file)
return data