-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathvtt_to_csv.py
81 lines (68 loc) · 3.14 KB
/
vtt_to_csv.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import os
import re
from nltk import tokenize
import csv
def remove_tags(fileContents):
    """Strip ``<c>`` span tags from the given subtitle text.

    Three substitution passes run one after another: opening tags such as
    ``<c>`` or ``<c.colorE5E5E5>``, closing tags with an optional suffix, and
    finally any plain ``</c>`` still present. Text between tags is kept.

    Returns the text with those tags removed.
    """
    tag_patterns = (
        r'<c[.\w\d]*>',
        r'</c[.\w\d]*>',
        r'</c>',
    )
    cleaned = fileContents
    # Apply the passes sequentially; each one works on the previous result.
    for pattern in tag_patterns:
        cleaned = re.sub(pattern, '', cleaned)
    return cleaned
def tokenize_text():
    """Collect the cue text of every .vtt file in the current directory as sentences.

    Each file is read as UTF-8 (a leading byte-order mark is tolerated). The
    following lines are dropped: the ``WEBVTT`` header block (everything up to
    the first blank line when the file starts with ``WEBVTT``), blank lines,
    lines made only of digits (presumably cue identifiers), and timing lines
    (lines that start with ``HH:MM:SS`` or contain ``-->``). The remaining
    lines of all files are joined with single spaces, ``<c>`` tags are removed
    with ``remove_tags`` and the result is split with NLTK's ``sent_tokenize``.

    Returns:
        The list of sentences; empty when no subtitle text was found.
    """
    # os.listdir gives no ordering guarantee; sort so the output is reproducible.
    vtt_files = sorted(name for name in os.listdir("./") if name.endswith(".vtt"))
    # Report the count once, and also when nothing was found.
    print(f"Found {len(vtt_files)} .vtt file(s).")

    numeric_line = re.compile(r'^[0-9]+$')
    timestamp_start = re.compile(r'^[0-9]{2}:[0-9]{2}:[0-9]{2}')

    pieces = []
    for name in vtt_files:
        # WebVTT is defined as UTF-8; 'utf-8-sig' also swallows an optional BOM.
        with open(name, 'r', encoding='utf-8-sig') as subtitles:
            in_header = False
            for index, raw_line in enumerate(subtitles):
                line = raw_line.rstrip('\n')
                if index == 0 and line.startswith('WEBVTT'):
                    in_header = True
                if in_header:
                    # The header block ends at the first blank line.
                    if line == '':
                        in_header = False
                    continue
                if line == '':
                    continue
                # '-->' also catches timings written without an hours field
                # (mm:ss.ttt), which the HH:MM:SS pattern alone would miss.
                if '-->' in line or timestamp_start.search(line):
                    continue
                if numeric_line.search(line):
                    continue
                pieces.append(line)

    # Join once instead of growing a string line by line.
    text = ' '.join(pieces).lstrip()
    text = remove_tags(text)
    return tokenize.sent_tokenize(text)
def find_matching_sentences():
    """Find, for every word of the user's word list, the sentences containing it.

    The words come from ``create_list_words`` so that the result lines up
    index-for-index with that function's output, which is how ``create_csv``
    pairs them. Matching is a case-insensitive substring test against the
    sentences returned by ``tokenize_text``.

    Returns:
        A list with one entry per word, in word-list order. Each entry is the
        list of matching sentences (empty when nothing matched or when the
        word is blank).
    """
    sentences = tokenize_text()
    # Lower-case every sentence once instead of once per word.
    lowered_sentences = [(sentence, sentence.lower()) for sentence in sentences]

    matching_sentences = []
    for word in create_list_words():
        needle = word.lower()
        if not needle.strip():
            # A blank cell would otherwise "match" every sentence.
            matching_sentences.append([])
            continue
        matching_sentences.append(
            [sentence for sentence, lowered in lowered_sentences if needle in lowered]
        )
    return matching_sentences
def create_list_words():
    """Read the word list from the user's CSV file(s) in the current directory.

    Every ``.csv`` file except ``words_with_sentences.csv`` (the file this
    script writes) is read, in the order ``os.listdir`` reports them. The first
    cell of each non-empty row is taken as a word.

    Returns:
        The list of words, in file and row order.
    """
    words = []
    for name in os.listdir("./"):
        if not name.endswith(".csv") or name == 'words_with_sentences.csv':
            continue
        # newline='' leaves line-ending handling to the csv module, so line
        # breaks inside quoted fields are preserved unchanged.
        with open(name, 'r', newline='') as list_words:
            words.extend(row[0] for row in csv.reader(list_words) if row)
    return words
def create_csv(matching_sentences, words):
    """Write ``words_with_sentences.csv`` in the current directory.

    Each row starts with a word, followed by one column per matching sentence.
    A word without matching sentences gets a single empty second column.

    Args:
        matching_sentences: One list of sentences per word, in the same order
            as ``words``. Entries beyond ``len(words)`` are ignored.
        words: The words for the first column.

    Raises:
        IndexError: If ``matching_sentences`` has fewer entries than ``words``.
    """
    # Check before opening so a mismatch does not leave a truncated file behind.
    if len(matching_sentences) < len(words):
        raise IndexError(
            f"expected {len(words)} sentence lists, got {len(matching_sentences)}"
        )
    # newline='' stops the platform from adding a second carriage return to the
    # csv writer's own line endings; UTF-8 can encode any subtitle text.
    with open("words_with_sentences.csv", "w", newline='', encoding='utf-8') as words_with_sentences:
        writer = csv.writer(words_with_sentences)
        for word, sentences in zip(words, matching_sentences):
            if sentences:
                writer.writerow([word, *sentences])
            else:
                writer.writerow([word, ""])
if __name__ == '__main__':
    # Script entry point: match sentences first, then load the word list,
    # and write both out as the result CSV.
    sentence_matches = find_matching_sentences()
    word_list = create_list_words()
    create_csv(sentence_matches, word_list)
    print("The file(s) have been processed. A file called 'words_with_sentences.csv' has been created.")