# verse-trigram.py
from nltk import ngrams
from nltk.tokenize import word_tokenize
import json
import os
import nltk
from tqdm import tqdm
from collections import Counter
# Default corpus location, output directory, and records per output file.
input_file = "/mnt/sdc/genius/verse/verse-clean.jsonl"
output_folder = "/mnt/sdc/genius/verse/trigram"
batch_size = 100000


def top_trigrams(lyrics, limit=20):
    """Return the ``limit`` most frequent word trigrams in ``lyrics``.

    The text is tokenized with NLTK's ``word_tokenize``; the result is a list
    of ``(trigram_tuple, count)`` pairs as produced by
    ``Counter.most_common``. Text with fewer than three tokens gives ``[]``.
    """
    tokens = word_tokenize(lyrics)
    return Counter(ngrams(tokens, 3)).most_common(limit)


def write_batch(folder, file_count, records):
    """Write one batch of JSON strings to a numbered JSONL file.

    Args:
        folder: Directory that receives the file (must already exist).
        file_count: Batch number, zero-padded to five digits in the filename.
        records: Already-serialized JSON strings, one per output line.

    Returns:
        The path of the file that was written.
    """
    output_file = os.path.join(folder, f"verse-tri-{file_count:05d}.jsonl")
    with open(output_file, 'w', encoding='utf-8') as outfile:
        outfile.write("\n".join(records) + "\n")
    return output_file


def main(source=input_file, folder=output_folder, per_file=batch_size):
    """Compute the top 20 trigrams for every song in a JSONL corpus.

    Each non-blank line of ``source`` is parsed as a JSON object; its
    ``"lyrics"`` and ``"song_id"`` fields are read, and one
    ``{"song_id": ..., "trigrams": ...}`` record is emitted per song. Records
    are written to ``folder`` in files of at most ``per_file`` lines.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        OSError: If the input cannot be read or an output cannot be written.
    """
    # NOTE(review): recent NLTK releases appear to load tokenizer data from
    # 'punkt_tab' rather than 'punkt' -- confirm against the installed version.
    nltk.download('punkt')
    os.makedirs(folder, exist_ok=True)

    file_count = 1
    batch_data = []
    with open(source, 'r', encoding='utf-8') as infile:
        # Iterate the handle directly so only one batch is held in memory at a
        # time; the progress bar therefore shows a running count, not a total.
        for line in tqdm(infile, desc="Processing", unit="line"):
            if not line.strip():
                # Blank lines (e.g. a trailing newline) carry no record.
                continue
            data = json.loads(line)
            # `or ""` also covers an explicit JSON null for "lyrics".
            lyrics = data.get("lyrics") or ""
            song_id = data.get("song_id", "")
            batch_data.append(
                json.dumps({"song_id": song_id, "trigrams": top_trigrams(lyrics)})
            )
            if len(batch_data) >= per_file:
                write_batch(folder, file_count, batch_data)
                batch_data = []
                file_count += 1

    # Flush whatever is left after the last full batch.
    if batch_data:
        write_batch(folder, file_count, batch_data)


if __name__ == "__main__":
    main()
    print("Top 20 trigrams per song have been saved in batches of 100K.")