compara-deep-learning/read_get_gene_seq.py at master · EMBLEBIGSOC/compara-deep-learning · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
import json
from Bio import SeqIO
import os
import gzip
import progressbar


def read_from_multiple_lsy(lsyfl):
    lsy = {}
    for f in lsyfl:
        d = {}
        with open(f, "r") as file:
            d = dict(json.load(file))
            for t in d:
                lsy[t] = d[t]
    return lsy

# this function updates the given dictionary with the given keys and
# values list


def create_dict(keys, values, dictionary):
    for i in range(len(keys)):
        if keys[i] not in dictionary:
            dictionary[keys[i]] = values[i]

    return dictionary

# this function maps all the genes to their respective species.
# (Function: when finding the species of any gene we do not need to search the entire dataframe)


def group_seq_by_species(df, g_to_sp):
    sph = list(df.homology_species)
    ghsp = list(df.homology_gene_stable_id)
    sp = list(df.species)
    gsp = list(df.gene_stable_id)
    create_dict(gsp, sp, g_to_sp)
    create_dict(ghsp, sph, g_to_sp)
    return g_to_sp

# this function returns the gene-id and gene-biotype from the description
# in the fasta file record.


def description_cleaner(description):
    description = description.split()
    t = ""
    gbt = ""
    for x in description:
        try:
            x = x.split(":")
            if x[0] == "gene":
                t = x[1].split(".")[0]
            if x[0] == "gene_biotype":
                gbt = x[1]
        except BaseException:
            return "aa", "aa"
    return t, gbt


def read_gene_seq(dirname, s, genes_by_species):
    lof = os.listdir(dirname)  # list all the files in the sequences directory
    ftr = []
    for f in lof:
        if f.split(".")[
                0] in s:  # check whether the species is present in the species to read list. Will skip those species which are not present in the dataframe
            ftr.append(f)
    data = {}
    for f in progressbar.progressbar(ftr):
        species = f.split(".")[0].lower()
        with gzip.open(dirname + "/" + f, "rt") as file:
            record = SeqIO.parse(file, "fasta")
            for r in record:
                gid, gbt = description_cleaner(r.description)
                if str(gid) not in data and str(
                        gid) in genes_by_species[species] and gbt == "protein_coding":
                    data[gid] = str(r.seq)
    return data


def read_gene_sequences(hdf, lsy, data_dir, fname):
    """The basic idea here is to create a list/dictionary of all the genes by their species.
       Once the mapping is done, all the respective fasta sequence files are read by Species
       and the CDNA sequences for each gene in the species record are read and stored.
       Thus we don't have to read the same file multiple times."""
    grouped_genes = {}
    gene_by_species_dict = {}
    for df in progressbar.progressbar(hdf):
        grouped_genes = group_seq_by_species(df, grouped_genes)
        for i in df.homology_species.unique():
            gene_by_species_dict[i] = []
        for i in df.species.unique():
            gene_by_species_dict[i] = []

    for x in progressbar.progressbar(lsy):
        try:
            species = grouped_genes[x]  # get the species
        except BaseException:
            continue
        # check if the gene already exists in the species dict or not.
        if x not in gene_by_species_dict[species]:
            gene_by_species_dict[species].append(x)
        xl = lsy[x]['b']
        xr = lsy[x]['f']
        for gxl in xl:
            if gxl == "NULL_GENE":
                break
            if gxl not in gene_by_species_dict[species]:
                gene_by_species_dict[species].append(gxl)
        for gxr in xr:
            if gxr == "NULL_GENE":
                break
            if gxr not in gene_by_species_dict[species]:
                gene_by_species_dict[species].append(gxr)

    # select those species only whose gene sequences we have to read.
    s = [x for x in gene_by_species_dict if len(gene_by_species_dict[x]) != 0]
    s = [x.capitalize() for x in s]

    data = read_gene_seq(data_dir, s, gene_by_species_dict)
    not_found = {}
    for species in gene_by_species_dict:
        for gene in gene_by_species_dict[species]:
            try:
                _ = data[gene]
            except BaseException:
                not_found[gene] = 1

    with open("processed/not_found_" + fname + ".json", "w") as file:
        json.dump(not_found, file)
    return data