KBNLresearch · renevanderark · Jun 26, 2024
diff --git a/data/create_df_with_nlp_data.py b/data/create_df_with_nlp_data.py
@@ -4,37 +4,40 @@
 
 from spacy.lang.nl.examples import sentences
 
+nlp = spacy.load("../model/nl_core_news_sm")
+print("spacy geladen", flush=True)
+
 def process_text(text):
+    # Run the module-level spaCy pipeline `nlp` on the text and return its result
     return nlp(text)
 
-rows_to_keep = range(0, 100)
-vaderland = pd.read_csv("HetVaderland_1873.csv", index_col=0, delimiter=',', encoding='utf-8', quotechar='"', quoting=0, skiprows = lambda x: x not in rows_to_keep)
-standaard = pd.read_csv("DeStandaard_1873.csv", index_col = 0, delimiter=',', encoding='utf-8', quotechar='"', quoting=0, skiprows = lambda x: x not in rows_to_keep)
-tijd = pd.read_csv("DeTijd_1873.csv", index_col = 0, delimiter=',', encoding='utf-8', quotechar='"', quoting=0, skiprows = lambda x: x not in rows_to_keep)
+vaderland = pd.read_csv("HetVaderland_1873.csv", index_col=0, delimiter=',', encoding='utf-8', quotechar='"', quoting=0)
+print("vaderland ingelezen: " + str(len(vaderland)), flush=True)
+standaard = pd.read_csv("DeStandaard_1873.csv", index_col = 0, delimiter=',', encoding='utf-8', quotechar='"', quoting=0)
+print("standaard ingelezen: " + str(len(standaard)), flush=True)
+tijd = pd.read_csv("DeTijd_1873.csv", index_col = 0, delimiter=',', encoding='utf-8', quotechar='"', quoting=0)
+print("tijd ingelezen:      " + str(len(tijd)), flush=True)
 
 data = pd.concat([vaderland, standaard, tijd])
+print("data geplakt", flush=True)
 
 data["date"] = pd.to_datetime(data["date"])
-
 data['month'] = data['date'].dt.strftime("%B")
 data['day'] = data['date'].dt.strftime("%A")
 
 data = data.sort_values(by='date')
+print("data gesorteerd", flush=True)
 
 data = data.dropna(subset=['content'])
-
-# Specify the relative path to the model directory
-model_path = "../model/nl_core_news_sm"
-
-# Load the model from the relative path
-nlp = spacy.load(model_path)
+print("data gerefinet", flush=True)
 
 data["doc"] = data["content"].apply(process_text)
 
 # Serialize
 with open('processed_docs.pkl', 'wb') as f:
     pickle.dump(data, f)
 
+print("klaar met dumpen", flush=True)
 # # Deserialize
 # with open('processed_docs.pkl', 'rb') as f:
 #     processed_docs = pickle.load(f)
diff --git a/data/prune.py b/data/prune.py
@@ -0,0 +1,10 @@
+import pandas as pd
+import pickle
+
+# Load the DataFrame pickled by create_df_with_nlp_data.py and write a random 0.5%
+# sample of its rows (unseeded, so it differs per run) to reduced_docs.pkl.
+data = pd.read_pickle('processed_docs.pkl')
+reduced_data = data.sample(frac = 0.005)
+
+with open('reduced_docs.pkl', 'wb') as f:
+    pickle.dump(reduced_data, f)
diff --git a/data/reduced_docs.pkl b/data/reduced_docs.pkl