-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathGenerator.py
190 lines (148 loc) · 5.99 KB
/
Generator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
# --- Generator expressions vs. list comprehensions -------------------------
# A generator expression () returns a lazy generator object.
(2 * num for num in range(5))
# A list comprehension [] builds and returns the whole list eagerly.
[2 * num for num in range(5)]

# Both can be iterated over.
num_gen = (2 * num for num in range(5))
for num in num_gen:
    print(num)

num_gen = [2 * num for num in range(5)]
for num in num_gen:
    print(num)

# Convert a generator to a list (note: this exhausts the generator).
num_gen = (2 * num for num in range(5))
print(list(num_gen))

# Lazily compute the length of each string element.
person_list = ['John', 'Katie', 'Angel', 'Julia', 'Peter']
lengths = (len(person) for person in person_list)
for value in lengths:
    print(value)

# When do we prefer a generator over a list comprehension?
# We prefer the generator when working with extremely large sequences,
# as we don't want to store the entire list in memory, which is what a
# list comprehension would do.  We want to generate elements on the fly.
#
# BUG FIX (review): the original line read `(...]` — mismatched brackets,
# a SyntaxError — and was never assigned to `num_gen`, so the next() calls
# below failed.  Also, merely constructing the integer 2 ** 10000000000
# costs ~1.2 GB; 10 ** 100 is still astronomically too large to
# materialize as a list, which is all the example needs.
num_gen = (2 * num for num in range(10 ** 100))  # Prefer: lazy, O(1) memory
print(next(num_gen))
print(next(num_gen))
print(next(num_gen))
# Avoid: a list comprehension over the same range would exhaust memory,
# so it is shown here commented out rather than executed.
# [2 * num for num in range(10 ** 100)]

# Use generators to load a very large (streaming) data file line by line.
# Case 1: process the first 500 rows of a file line by line, to create a
# dictionary of the counts of how many times each country appears in a
# column in the dataset.
counts_dict = {}
def read_large_file(file_object):
    """Yield one line at a time from *file_object*, reading on the fly.

    Stops when readline() returns an empty string, i.e. at end of file.
    """
    line = file_object.readline()
    while line:
        yield line
        line = file_object.readline()
# Open a connection to the target file with a context manager 'with'
# statement, ensuring the file handle is released even on error.
with open('large_data.csv') as file:
    file.readline()  # Skip the header row
    counts_dict = {}  # Accumulates country name -> occurrence count
    # Process only the first 500 data rows
    for _ in range(500):
        # Split the current line into its comma-separated columns
        cols = file.readline().split(',')
        # The first column holds the country name
        country = cols[0]
        # dict.get() folds the "seen before?" branch into one expression:
        # start from 0 for unseen countries, then increment.
        counts_dict[country] = counts_dict.get(country, 0) + 1
print(counts_dict)
# Case 1.1: stream every row of the file through the read_large_file()
# generator, counting how many times each country appears in the first
# column — the whole file is never held in memory at once.
counts_dict = {}
with open('large_data.csv') as file:
    # Iterate over the generator returned by read_large_file()
    for record in read_large_file(file):
        country = record.split(',')[0]
        # Start from 0 for unseen countries, then increment.
        counts_dict[country] = counts_dict.get(country, 0) + 1
print(counts_dict)
# Case 2: bioinformatics scenario: parse a very large genomic data file.
def parse_fasta(file_path):
    """Lazily yield each sequence in the FASTA file at *file_path*.

    Header lines (starting with '>') delimit records; a record's sequence
    lines are stripped of surrounding whitespace, concatenated, and
    yielded as one string.  Empty records are skipped.
    """
    with open(file_path, 'r') as fasta_file:
        chunks = []  # Stripped lines of the record currently being read
        for raw in fasta_file:
            if raw.startswith(">"):
                # New header: emit the record gathered so far, if any.
                record = "".join(chunks)
                chunks = []
                if record:
                    yield record
            else:
                chunks.append(raw.strip())
        # Emit the final record, which has no header following it.
        record = "".join(chunks)
        if record:
            yield record
# Usage example
fasta_file_path = "large_genomic_data.fasta"
# parse_fasta() is lazy: no file I/O happens until the first next() call.
sequences = parse_fasta(fasta_file_path)
# Process sequences one by one using next()
try:
    while True:
        # next() resumes the generator until its next yield.
        sequence = next(sequences)
        print("Processing sequence:", sequence[:20], "...")
# The generator signals exhaustion by raising StopIteration.
except StopIteration:
    print("Finished processing all sequences.")
# NOTE(review): a missing "large_genomic_data.fasta" raises
# FileNotFoundError at the first next(), which this except clause does
# not catch — confirm the file exists before running.
# Generators in printing the file paths.
# os.walk() lazily yields a 3-tuple (dirpath, dirnames, filenames) for
# each directory it visits.
import os

# Print every sub-directory and file path under ./data/raw/.
# FIX (review): renamed the loop variables from `dir` / `file`, which
# shadowed the built-ins of the same names.
for root, dirnames, filenames in os.walk("./data/raw/"):
    for dirname in dirnames:
        print(os.path.join(root, dirname))
    for filename in filenames:
        print(os.path.join(root, filename))

# `_` captures the dirnames slot when only file names are needed.
for root, _, filenames in os.walk("./data/raw/"):
    for filename in filenames:
        print(os.path.join(root, filename))
# A conditional inside a generator expression acts as a filter.
aa = ['histidine', 'isoleucine', 'leucine', 'lysine', 'methionine',
      'phenylalanine', 'threonine', 'tryptophan', 'valine']
# Keep only the amino-acid names that contain the letter "t".
aa_target = (name for name in aa if "t" in name)
print(list(aa_target))
# Build a generator function that produces generator objects when called.
# Instead of returning a single value, it yields a sequence of values
# using the keyword `yield`.
"""
Case 1
"""
def yield_gene_expression():
    """Yield one formatted expression-report string per gene."""
    expression_data = [
        ("Gene1", 10.5),
        ("Gene2", 15.2),
        ("Gene3", 8.7),
        ("Gene4", 12.0),
        ("Gene5", 9.8),
    ]
    for gene, value in expression_data:
        yield f"Gene: {gene}, Expression: {value} TPM"

# Pull all five reports from a fresh generator, one next() at a time.
gene_expression_generator = yield_gene_expression()
for _ in range(5):
    print(next(gene_expression_generator))
# Last line printed: Gene: Gene5, Expression: 9.8 TPM
"""
Case 2
"""
def num_sequence(n):
"""Generate values from 0 to n."""
i = 0
while i < n:
yield i
i += 1
result = num_sequence(7)
print(type(result)) # <class 'generator'>
for item in result:
print(item)
"""
Case 3
"""
person_list = ['John', 'Katie', 'Angel', 'Julia', 'Peter']
# Define generator function get_lengths that allows users to input any list
def get_lengths(input_list):
"""Generator function that yields the length of the strings in input_list."""
for person in input_list:
yield len(person)
for value in get_lengths(person_list):
print(value)