exp_corpus_loader.py
from news_loader import load_news
from text.cleanup import cleanup_text

from joblib import Parallel, delayed
import multiprocessing

import numpy as np

# Use all available CPU cores for the parallel text cleanup.
n_jobs = multiprocessing.cpu_count()


def load_cleaned_news(remove_stopwords=True, stem=False):
    """Load the news corpus and attach a cleaned version of each article as 'new_text'."""
    news = load_news(fields=['id', 'title', 'subtitle', 'subject', 'portal',
                             'text'])
    # Clean every article (text, title, subtitle) in parallel.
    results = Parallel(n_jobs=n_jobs)(
        delayed(cleanup_text)(article['text'],
                              article['title'],
                              article['subtitle'],
                              remove_stopwords,
                              stem) for article in news)
    for idx, article in enumerate(news):
        article['new_text'] = results[idx]
    return news


def get_corpus_bow():
    # Bag-of-words corpus: stopwords removed and tokens stemmed.
    news = load_cleaned_news(remove_stopwords=True, stem=True)
    labels = np.asarray([article['subject'] for article in news])
    corpus = [article['new_text'] for article in news]
    return corpus, labels


def get_corpus_tfidf():
    # TF-IDF uses the same preprocessing as the bag-of-words corpus.
    return get_corpus_bow()


def get_corpus_doc2vec():
    # Doc2Vec corpus: stopwords removed but no stemming, preserving word forms.
    news = load_cleaned_news(remove_stopwords=True, stem=False)
    labels = np.asarray([article['subject'] for article in news])
    corpus = [article['new_text'] for article in news]
    return corpus, labels


def get_corpus_nel():
    # Named-entity linking corpus: the original, uncleaned article text is returned.
    news = load_cleaned_news(remove_stopwords=False, stem=False)
    labels = np.asarray([article['subject'] for article in news])
    corpus = [article['text'] for article in news]
    return corpus, labels