-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathconvertdata.py
36 lines (28 loc) · 1.36 KB
/
convertdata.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
import numpy as np
import pandas as pd
import os
def preprocess_text_to_bytes(csv_path: str, npy_path: str) -> None:
    """
    Convert the 'text' column of a CSV file into a flat byte-level NPY file.

    The CSV is read as two columns (renamed to 'index' and 'text'); only the
    'text' column is loaded. Each cell is stripped of surrounding whitespace,
    missing/empty cells are dropped, the remaining rows are joined with a
    single space, and the UTF-8 encoding of that string is saved as a
    one-dimensional uint8 array.

    Args:
        csv_path (str): Path to the CSV file containing text data. The first
            line is treated as a header and replaced by the fixed column names.
        npy_path (str): Path to save the resulting NPY file. Note that
            ``np.save`` appends ".npy" if the path does not already end with it.

    Raises:
        FileNotFoundError: If ``csv_path`` does not exist.
    """
    if not os.path.exists(csv_path):
        raise FileNotFoundError(f"CSV file not found: {csv_path}")

    print(f"Processing {csv_path}...")

    # Read the CSV file, ensuring we only load the 'text' column.
    # NOTE(review): pandas' default NA parsing is still active, so cells whose
    # whole content is e.g. "NA" or "null" are read as missing and dropped below.
    df = pd.read_csv(
        csv_path,
        header=0,
        names=['index', 'text'],
        usecols=['text'],
        dtype=str,
    )

    # Drop NaN and empty rows (whitespace-only rows become empty after strip).
    texts = df['text'].fillna('').str.strip()
    texts = texts[texts != '']

    # Concatenate all rows into a single text string.
    combined_text = " ".join(texts)

    # Encode once and view the bytes directly as uint8. This avoids building a
    # Python list with one int object per byte, which is very slow and
    # memory-hungry for corpus-sized inputs. The resulting array is read-only,
    # which is fine because it is only written to disk.
    encoded = combined_text.encode('utf-8')
    byte_data = np.frombuffer(encoded, dtype=np.uint8)

    # Save the byte array as an NPY file.
    np.save(npy_path, byte_data)
    print(f"Saved byte data to {npy_path} (length: {len(byte_data)} bytes)")
# Convert the training and validation datasets, one (source CSV, target NPY)
# pair at a time. Note that the validation array is built from the *test* CSV.
for source_csv, target_npy in (
    ("C:/projects/bytropix/data/wikitext_train.csv", "C:/projects/bytropix/data/wikitext_train.npy"),
    ("C:/projects/bytropix/data/wikitext_test.csv", "C:/projects/bytropix/data/wikitext_val.npy"),
):
    preprocess_text_to_bytes(source_csv, target_npy)