diff --git a/data/create_df_with_nlp_data.py b/data/create_df_with_nlp_data.py
index 9336e28..8277b16 100644
--- a/data/create_df_with_nlp_data.py
+++ b/data/create_df_with_nlp_data.py
@@ -4,30 +4,32 @@
 from spacy.lang.nl.examples import sentences
 
 
+nlp = spacy.load("../model/nl_core_news_sm")
+print("spacy geladen", flush=True)
+
 def process_text(text):
+    # Load the model from the relative path
     return nlp(text)
 
-rows_to_keep = range(0, 100)
-vaderland = pd.read_csv("HetVaderland_1873.csv", index_col=0, delimiter=',', encoding='utf-8', quotechar='"', quoting=0, skiprows = lambda x: x not in rows_to_keep)
-standaard = pd.read_csv("DeStandaard_1873.csv", index_col = 0, delimiter=',', encoding='utf-8', quotechar='"', quoting=0, skiprows = lambda x: x not in rows_to_keep)
-tijd = pd.read_csv("DeTijd_1873.csv", index_col = 0, delimiter=',', encoding='utf-8', quotechar='"', quoting=0, skiprows = lambda x: x not in rows_to_keep)
+vaderland = pd.read_csv("HetVaderland_1873.csv", index_col=0, delimiter=',', encoding='utf-8', quotechar='"', quoting=0)
+print("vaderland ingelezen: " + str(len(vaderland)), flush=True)
+standaard = pd.read_csv("DeStandaard_1873.csv", index_col = 0, delimiter=',', encoding='utf-8', quotechar='"', quoting=0)
+print("standaard ingelezen: " + str(len(standaard)), flush=True)
+tijd = pd.read_csv("DeTijd_1873.csv", index_col = 0, delimiter=',', encoding='utf-8', quotechar='"', quoting=0)
+print("tijd ingelezen: " + str(len(tijd)), flush=True)
 
 data = pd.concat([vaderland, standaard, tijd])
+print("data geplakt", flush=True)
 
 data["date"] = pd.to_datetime(data["date"])
-
 data['month'] = data['date'].dt.strftime("%B")
 data['day'] = data['date'].dt.strftime("%A")
 
 data = data.sort_values(by='date')
+print("data gesorteerd", flush=True)
 
 data = data.dropna(subset=['content'])
-
-# Specify the relative path to the model directory
-model_path = "../model/nl_core_news_sm"
-
-# Load the model from the relative path
-nlp = spacy.load(model_path)
+print("data gerefinet", flush=True)
 
 data["doc"] = data["content"].apply(process_text)
 
@@ -35,6 +37,7 @@ def process_text(text):
 with open('processed_docs.pkl', 'wb') as f:
     pickle.dump(data, f)
 
+print("klaar met dumpen", flush=True)
 # # Deserialize
 # with open('processed_docs.pkl', 'rb') as f:
 #     processed_docs = pickle.load(f)
diff --git a/data/prune.py b/data/prune.py
new file mode 100644
index 0000000..63cb1d9
--- /dev/null
+++ b/data/prune.py
@@ -0,0 +1,10 @@
+import pandas as pd
+import pickle
+
+
+# Deserialize
+data = pd.read_pickle('processed_docs.pkl')
+reduced_data = data.sample(frac = 0.005)
+
+with open('reduced_docs.pkl', 'wb') as f:
+    pickle.dump(reduced_data, f)
\ No newline at end of file
diff --git a/data/reduced_docs.pkl b/data/reduced_docs.pkl
new file mode 100644
index 0000000..b34634b
Binary files /dev/null and b/data/reduced_docs.pkl differ