Bioinformatics-Assignment-6/process_data.py at master · Outfenneced/Bioinformatics-Assignment-6 · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
"""
Coder: William Thompson
Commenter: James Oakes
"""
import os  # <-- self explanatory
import pandas  # <-- self explanatory
import re  # <-- self explanatory
import csv  # <-- self explanatory
import numpy as np  # <-- imports numpy library and renames it to np
import matplotlib.pyplot as plt  # Imports matplotlib and renames it plt

INPUT_DIR = "data/YeastGenes"  # Sets the path of the Yeast Genome data
RNASEQ_DATA_PATH = "data/RNAseq_data.gff3"  # Sets the path of the RNA dataset
OUTPUT_DIR = "output"  # Sets the path for the output file with the processed data


def load_rnaseq_data(seqdata_path):  # Loads RNA dataset for processing
    rnaseq_data = dict()  # Sets the format of the dataset as dictionary
    parsed_file = pandas.read_csv(seqdata_path, comment="#", header=None, delimiter="\t")  # Parses the data

    feature_format = r"ID=(.*);Name=(.*);log2_transcription_level=(.*);Note=(.*)"  # Sets the format of the relevant data from the set (Name and Transcription Level)
    for row_num, row in parsed_file.iterrows(): # Iterates through the lines of the dataset and pulls the
        feature = row.values[-1]  # Finds last chunck of data (Tab delimited)
        regex_match = re.match(feature_format, feature)  # Finds the relevant data in the dataset
        orf_name_w_5utr = regex_match.group(1)  # Pulls the name of the gene
        orf_name = orf_name_w_5utr.replace("_5UTR", "")  # Removes the extra characters in the string
        transcription_level = regex_match.group(3)  # Pulls transcription level from the data
        rnaseq_data[orf_name] = transcription_level  # Associates transcription level with the gene name
    return rnaseq_data  # Outputs the name and transcription level


def calculate_gc_percent(sequence):
    count_gc = len(re.findall("[G|C]", sequence))  # Finds number of G's and C's in the sequence
    count_all = len(sequence)  # Finds overall length of sequence
    gc_percent = count_gc / count_all  # Expresses the GC content as # of G and C over the total length of the sequence
    return gc_percent


def update_file(input_dir, output_dir, rnaseq_info):  # Defines function to create updated files
    seq_data = list()  # Formats the data as a list
    dir_list = os.listdir(input_dir)  # Lists files in directory
    for file in dir_list:  # Iterates through the files in the input directory of Yeast genes
        path = os.path.join(input_dir, file)  # Creates path for each file
        with open(path) as f:  # Opens each file in the set Yeast files
            text = f.read().splitlines()  # Grabs the text of each file, and splits along lines
        orf_name = file.replace(".txt", "")  # Removes the .txt from each file name

        if orf_name in rnaseq_info:  # Checks to see if the Orf name is in the RNA dataset
            trans_level = rnaseq_info[orf_name]  # If it is, the transcription level is pulled from the list
        else:  # If not...
            trans_level = "NA"  # Set the transcription level to "NA"
        text[0] += " " + orf_name + " " + trans_level + "\n"  # Rewrites the text of the yeast gene file to include the Orf name and transcription level

        output_path = os.path.join(output_dir, file)  # Defines the file path to output the updated yeast info to
        with open(output_path, 'w') as of:  # Writes the data to a new file
            of.writelines(text)  # Inserts the text to the file

        # GC Content calculations
        if len(text) == 2:  # If the data has 2 lines, continue. Otherwise, ignore it. (Basically just for YEL076C
            gc_percent = calculate_gc_percent(text[1])
        else:  # If there isnt data...
            gc_percent = 0  # %GC = 0

        seq_data.append((orf_name, trans_level, gc_percent))  # Adds name, transcription level and GC content
    return seq_data  # Returns data to main code for plotting later


def plot_transcription_gc(seq_data):  # Beginning of plotting
    trans_level_list = [float(x[1]) for x in seq_data if x[1] != "NA"]  # Finds transcription level and sets it to X data
    gc_pct_list = [float(x[2]) for x in seq_data if x[1] != "NA"]  # Finds GC content and sets it to Y data

    plot = plt.scatter(trans_level_list, gc_pct_list, s=3)  # Plots Y vs X
    plt.xlabel("Transcription Level")  # Sets X label
    plt.ylabel("GC%")  # Sets Y label
    plt.title("Transcription Level vs GC%")  # Sets Title

    linear_regression = np.polyfit(trans_level_list, gc_pct_list, 1)  # Creates linear regression model for the data
    linear_x_vals = np.linspace(min(trans_level_list), max(trans_level_list), 2)  # Generates a matrix for the x values
    linear_y_vals = np.polyval(linear_regression, linear_x_vals)  # Uses regression to create the y values for the x values
    plt.plot(linear_x_vals, linear_y_vals, color="green")  # Creates the plot with a green line

    plt.show()  # Displays plot


if __name__ == '__main__':  # Tells python to run this if this is the file is being run
    rnaseq_info = load_rnaseq_data(RNASEQ_DATA_PATH)  # Opens RNA data
    os.makedirs(OUTPUT_DIR, exist_ok=True)  # Creates output directory
    seq_data = update_file(INPUT_DIR, OUTPUT_DIR, rnaseq_info)  # Calls funciton to update files

    with open("sequence_info.csv", "w") as csv_file:  # Creates csv with data
        writer = csv.writer(csv_file)  # Creates CSV file
        writer.writerows(seq_data)  # Writes data to file

    plot_transcription_gc(seq_data)  # calls plot creation function