diff --git a/Rock_fact_checker.py b/Rock_fact_checker.py
index f50db26..2de5220 100644
--- a/Rock_fact_checker.py
+++ b/Rock_fact_checker.py
@@ -1,46 +1,50 @@
-import streamlit as st
-
+import random
import time
import logging
from json import JSONDecodeError
-# from markdown import markdown
-# from annotated_text import annotation
-# from urllib.parse import unquote
-import random
+
+import streamlit as st
import pandas as pd
+import plotly.express as px
from app_utils.backend_utils import load_statements, query
-from app_utils.frontend_utils import set_state_if_absent, reset_results, entailment_html_messages
+from app_utils.frontend_utils import (
+ set_state_if_absent,
+ reset_results,
+ entailment_html_messages,
+ create_df_for_relevant_snippets,
+)
from app_utils.config import RETRIEVER_TOP_K
def main():
-
-
statements = load_statements()
# Persistent state
- set_state_if_absent('statement', "Elvis Presley is alive")
- set_state_if_absent('answer', '')
- set_state_if_absent('results', None)
- set_state_if_absent('raw_json', None)
- set_state_if_absent('random_statement_requested', False)
+ set_state_if_absent("statement", "Elvis Presley is alive")
+ set_state_if_absent("answer", "")
+ set_state_if_absent("results", None)
+ set_state_if_absent("raw_json", None)
+ set_state_if_absent("random_statement_requested", False)
-
- ## MAIN CONTAINER
st.write("# Fact checking 🎸 Rocks!")
st.write()
- st.markdown("""
+ st.markdown(
+ """
##### Enter a factual statement about [Rock music](https://en.wikipedia.org/wiki/List_of_mainstream_rock_performers) and let the AI check it out for you...
- """)
+ """
+ )
# Search bar
- statement = st.text_input("", value=st.session_state.statement,
- max_chars=100, on_change=reset_results)
+ statement = st.text_input(
+ "", value=st.session_state.statement, max_chars=100, on_change=reset_results
+ )
col1, col2 = st.columns(2)
col1.markdown(
- "", unsafe_allow_html=True)
+ "", unsafe_allow_html=True
+ )
col2.markdown(
- "", unsafe_allow_html=True)
+ "", unsafe_allow_html=True
+ )
# Run button
run_pressed = col1.button("Run")
# Random statement button
@@ -54,12 +58,15 @@ def main():
st.session_state.random_statement_requested = True
# Re-runs the script setting the random statement as the textbox value
# Unfortunately necessary as the Random statement button is _below_ the textbox
- # raise st.script_runner.RerunException(
- # st.script_request_queue.RerunData(None))
+ # Adapted for Streamlit>=1.12
+ raise st.runtime.scriptrunner.script_runner.RerunException(
+ st.runtime.scriptrunner.script_requests.RerunData("")
+ )
else:
st.session_state.random_statement_requested = False
- run_query = (run_pressed or statement != st.session_state.statement) \
- and not st.session_state.random_statement_requested
+ run_query = (
+ run_pressed or statement != st.session_state.statement
+ ) and not st.session_state.random_statement_requested
# Get results for query
if run_query and statement:
@@ -68,14 +75,14 @@ def main():
st.session_state.statement = statement
with st.spinner("🧠Performing neural search on documents..."):
try:
- st.session_state.results = query(
- statement, RETRIEVER_TOP_K)
+ st.session_state.results = query(statement, RETRIEVER_TOP_K)
time_end = time.time()
print(time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()))
- print(f'elapsed time: {time_end - time_start}')
+ print(f"elapsed time: {time_end - time_start}")
except JSONDecodeError as je:
st.error(
- "👓 An error occurred reading the results. Is the document store working?")
+ "👓 An error occurred reading the results. Is the document store working?"
+ )
return
except Exception as e:
logging.exception(e)
@@ -85,85 +92,36 @@ def main():
# Display results
if st.session_state.results:
results = st.session_state.results
- docs, agg_entailment_info = results['documents'], results['agg_entailment_info']
- print(results)
-
+ docs, agg_entailment_info = results["documents"], results["agg_entailment_info"]
+
+ # show different messages depending on entailment results
max_key = max(agg_entailment_info, key=agg_entailment_info.get)
message = entailment_html_messages[max_key]
- st.markdown(f'
{message}
', unsafe_allow_html=True)
- st.markdown(f'###### Aggregate entailment information:')
- st.write(results['agg_entailment_info'])
- st.markdown(f'###### Relevant snippets:')
-
- # colms = st.columns((2, 5, 1, 1, 1, 1))
- # fields = ["Page title",'Content', 'Relevance', 'contradiction', 'neutral', 'entailment']
- # for col, field_name in zip(colms, fields):
- # # header
- # col.write(field_name)
- df = []
- for doc in docs:
- # col1, col2, col3, col4, col5, col6 = st.columns((2, 5, 1, 1, 1, 1))
- # col1.write(f"[{doc.meta['name']}]({doc.meta['url']})")
- # col2.write(f"{doc.content}")
- # col3.write(f"{doc.score:.3f}")
- # col4.write(f"{doc.meta['entailment_info']['contradiction']:.2f}")
- # col5.write(f"{doc.meta['entailment_info']['neutral']:.2f}")
- # col6.write(f"{doc.meta['entailment_info']['entailment']:.2f}")
-
- # 'con': f"{doc.meta['entailment_info']['contradiction']:.2f}",
- # 'neu': f"{doc.meta['entailment_info']['neutral']:.2f}",
- # 'ent': f"{doc.meta['entailment_info']['entailment']:.2f}",
- # # 'url': doc.meta['url'],
- # 'Content': doc.content}
- #
- #
- #
- row = {'Title': doc.meta['name'],
- 'Relevance': f"{doc.score:.3f}",
- 'con': f"{doc.meta['entailment_info']['contradiction']:.2f}",
- 'neu': f"{doc.meta['entailment_info']['neutral']:.2f}",
- 'ent': f"{doc.meta['entailment_info']['entailment']:.2f}",
- # 'url': doc.meta['url'],
- 'Content': doc.content}
- df.append(row)
- st.dataframe(pd.DataFrame(df))#.style.apply(highlight))
-
-
- # if len(st.session_state.results['answers']) == 0:
- # st.info("""🤔 Haystack is unsure whether any of
- # the documents contain an answer to your question. Try to reformulate it!""")
-
- # for result in st.session_state.results['answers']:
- # result = result.to_dict()
- # if result["answer"]:
- # if alert_irrelevance and result['score'] < LOW_RELEVANCE_THRESHOLD:
- # alert_irrelevance = False
- # st.write("""
- # Attention, the
- # following answers have low relevance:
""",
- # unsafe_allow_html=True)
-
- # answer, context = result["answer"], result["context"]
- # start_idx = context.find(answer)
- # end_idx = start_idx + len(answer)
- # # Hack due to this bug: https://github.com/streamlit/streamlit/issues/3190
- # st.write(markdown("- ..."+context[:start_idx] +
- # str(annotation(answer, "ANSWER", "#3e1c21", "white")) +
- # context[end_idx:]+"..."), unsafe_allow_html=True)
- # source = ""
- # name = unquote(result['meta']['name']).replace('_', ' ')
- # url = result['meta']['url']
- # source = f"[{name}]({url})"
- # st.markdown(
- # f"**Score:** {result['score']:.2f} - **Source:** {source}")
-
-# def make_pretty(styler):
-# styler.set_caption("Weather Conditions")
-# # styler.format(rain_condition)
-# styler.format_con(lambda v: v.float(v))
-# styler.background_gradient(axis=None, vmin=0, vmax=1, cmap="YlGnBu")
-# return styler
-
-def highlight(s):
- return ['background-color: red']*5
-main()
\ No newline at end of file
+ st.markdown(f"
{message}
", unsafe_allow_html=True)
+
+ st.markdown(f"###### Aggregate entailment information:")
+ col1, col2 = st.columns([2, 1])
+ df_agg_entailment_info = pd.DataFrame([results["agg_entailment_info"]])
+ fig = px.scatter_ternary(
+ df_agg_entailment_info,
+ a="contradiction",
+ b="neutral",
+ c="entailment",
+ size="contradiction",
+ )
+ with col1:
+ st.plotly_chart(fig, use_container_width=True)
+ with col2:
+ st.write(results["agg_entailment_info"])
+
+ st.markdown(f"###### Relevant snippets:")
+ df, urls = create_df_for_relevant_snippets(docs)
+ st.dataframe(df)
+
+ str_wiki_pages = "Wikipedia source pages: "
+ for doc, url in urls.items():
+ str_wiki_pages += f"[{doc}]({url}) "
+ st.markdown(str_wiki_pages)
+
+
+main()
diff --git a/app_utils/backend_utils.py b/app_utils/backend_utils.py
index 17bebb1..7c6036f 100644
--- a/app_utils/backend_utils.py
+++ b/app_utils/backend_utils.py
@@ -1,42 +1,61 @@
import shutil
+
from haystack.document_stores import FAISSDocumentStore
from haystack.nodes import EmbeddingRetriever
from haystack.pipelines import Pipeline
-
import streamlit as st
from app_utils.entailment_checker import EntailmentChecker
+from app_utils.config import (
+ STATEMENTS_PATH,
+ INDEX_DIR,
+ RETRIEVER_MODEL,
+ RETRIEVER_MODEL_FORMAT,
+ NLI_MODEL,
+)
+
+
+@st.cache()
+def load_statements():
+ """Load statements from file"""
+ with open(STATEMENTS_PATH) as fin:
+ statements = [
+ line.strip() for line in fin.readlines() if not line.startswith("#")
+ ]
+ return statements
-from app_utils.config import STATEMENTS_PATH, INDEX_DIR, RETRIEVER_MODEL, RETRIEVER_MODEL_FORMAT, NLI_MODEL
# cached to make index and models load only at start
-@st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None}, allow_output_mutation=True)
+@st.cache(
+ hash_funcs={"builtins.SwigPyObject": lambda _: None}, allow_output_mutation=True
+)
def start_haystack():
"""
load document store, retriever, reader and create pipeline
"""
- shutil.copy(f'{INDEX_DIR}/faiss_document_store.db', '.')
+ shutil.copy(f"{INDEX_DIR}/faiss_document_store.db", ".")
document_store = FAISSDocumentStore(
- faiss_index_path=f'{INDEX_DIR}/my_faiss_index.faiss',
- faiss_config_path=f'{INDEX_DIR}/my_faiss_index.json')
- print(f'Index size: {document_store.get_document_count()}')
-
+ faiss_index_path=f"{INDEX_DIR}/my_faiss_index.faiss",
+ faiss_config_path=f"{INDEX_DIR}/my_faiss_index.json",
+ )
+ print(f"Index size: {document_store.get_document_count()}")
+
retriever = EmbeddingRetriever(
document_store=document_store,
embedding_model=RETRIEVER_MODEL,
- model_format=RETRIEVER_MODEL_FORMAT
+ model_format=RETRIEVER_MODEL_FORMAT,
)
-
- entailment_checker = EntailmentChecker(model_name_or_path=NLI_MODEL,
- use_gpu=False)
-
+
+ entailment_checker = EntailmentChecker(model_name_or_path=NLI_MODEL, use_gpu=False)
pipe = Pipeline()
pipe.add_node(component=retriever, name="retriever", inputs=["Query"])
pipe.add_node(component=entailment_checker, name="ec", inputs=["retriever"])
return pipe
+
pipe = start_haystack()
+
# the pipeline is not included as parameter of the following function,
# because it is difficult to cache
@st.cache(persist=True, allow_output_mutation=True)
@@ -45,28 +64,28 @@ def query(statement: str, retriever_top_k: int = 5):
params = {"retriever": {"top_k": retriever_top_k}}
results = pipe.run(statement, params=params)
- scores, agg_con, agg_neu, agg_ent = 0,0,0,0
- for doc in results['documents']:
- scores+=doc.score
- ent_info=doc.meta['entailment_info']
- con,neu,ent = ent_info['contradiction'], ent_info['neutral'], ent_info['entailment']
- agg_con+=con*doc.score
- agg_neu+=neu*doc.score
- agg_ent+=ent*doc.score
-
- results['agg_entailment_info'] = {
- 'contradiction': round(agg_con/scores, 2),
- 'neutral': round(agg_neu/scores, 2),
- 'entailment': round(agg_ent/scores, 2)}
-
- return results
+ scores, agg_con, agg_neu, agg_ent = 0, 0, 0, 0
+ for i, doc in enumerate(results["documents"]):
+ scores += doc.score
+ ent_info = doc.meta["entailment_info"]
+ con, neu, ent = (
+ ent_info["contradiction"],
+ ent_info["neutral"],
+ ent_info["entailment"],
+ )
+ agg_con += con * doc.score
+ agg_neu += neu * doc.score
+ agg_ent += ent * doc.score
-@st.cache()
-def load_statements():
- """Load statements from file"""
- with open(STATEMENTS_PATH) as fin:
- statements = [line.strip() for line in fin.readlines()
- if not line.startswith('#')]
- return statements
+ # if the first 3 documents already show strong evidence of entailment/contradiction,
+ # there is no need to consider less relevant documents
+ if i == 2 and max(agg_con, agg_ent) / scores > 0.5:
+ results["documents"] = results["documents"][: i + 1]
+ break
-
\ No newline at end of file
+ results["agg_entailment_info"] = {
+ "contradiction": round(agg_con / scores, 2),
+ "neutral": round(agg_neu / scores, 2),
+ "entailment": round(agg_ent / scores, 2),
+ }
+ return results
diff --git a/app_utils/config.py b/app_utils/config.py
index 0dece9a..0f6259b 100644
--- a/app_utils/config.py
+++ b/app_utils/config.py
@@ -1,6 +1,5 @@
-
-INDEX_DIR = 'data/index'
-STATEMENTS_PATH = 'data/statements.txt'
+INDEX_DIR = "data/index"
+STATEMENTS_PATH = "data/statements.txt"
RETRIEVER_MODEL = "sentence-transformers/msmarco-distilbert-base-tas-b"
RETRIEVER_MODEL_FORMAT = "sentence_transformers"
diff --git a/app_utils/entailment_checker.py b/app_utils/entailment_checker.py
index fa39b3d..8868817 100644
--- a/app_utils/entailment_checker.py
+++ b/app_utils/entailment_checker.py
@@ -1,11 +1,12 @@
from typing import List, Optional
-from transformers import AutoModelForSequenceClassification,AutoTokenizer,AutoConfig
+from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig
import torch
from haystack.nodes.base import BaseComponent
from haystack.modeling.utils import initialize_device_settings
from haystack.schema import Document, Answer, Span
+
class EntailmentChecker(BaseComponent):
"""
This node checks the entailment between every document content and the query.
@@ -38,29 +39,37 @@ def __init__(
tokenizer = tokenizer or model_name_or_path
self.tokenizer = AutoTokenizer.from_pretrained(tokenizer)
- self.model = AutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path=model_name_or_path,revision=model_version)
+ self.model = AutoModelForSequenceClassification.from_pretrained(
+ pretrained_model_name_or_path=model_name_or_path, revision=model_version
+ )
self.batch_size = batch_size
self.model.to(str(self.devices[0]))
-
+
id2label = AutoConfig.from_pretrained(model_name_or_path).id2label
- self.labels= [id2label[k].lower() for k in sorted(id2label)]
- if 'entailment' not in self.labels:
- raise ValueError("The model config must contain entailment value in the id2label dict.")
-
+ self.labels = [id2label[k].lower() for k in sorted(id2label)]
+ if "entailment" not in self.labels:
+ raise ValueError(
+ "The model config must contain entailment value in the id2label dict."
+ )
+
def run(self, query: str, documents: List[Document]):
for doc in documents:
- entailment_dict=self.get_entailment(premise=doc.content, hypotesis=query)
- doc.meta['entailment_info']=entailment_dict
- return {'documents':documents}, "output_1"
-
+ entailment_dict = self.get_entailment(premise=doc.content, hypotesis=query)
+ doc.meta["entailment_info"] = entailment_dict
+ return {"documents": documents}, "output_1"
+
def run_batch():
pass
-
- def get_entailment(self, premise,hypotesis):
+
+ def get_entailment(self, premise, hypotesis):
with torch.no_grad():
- inputs = self.tokenizer(f'{premise}{self.tokenizer.sep_token}{hypotesis}', return_tensors="pt").to(self.devices[0])
+ inputs = self.tokenizer(
+ f"{premise}{self.tokenizer.sep_token}{hypotesis}", return_tensors="pt"
+ ).to(self.devices[0])
out = self.model(**inputs)
logits = out.logits
- probs = torch.nn.functional.softmax(logits, dim=-1)[0,:].cpu().detach().numpy()
- entailment_dict={k.lower():v for k,v in zip (self.labels, probs)}
- return entailment_dict
\ No newline at end of file
+ probs = (
+ torch.nn.functional.softmax(logits, dim=-1)[0, :].cpu().detach().numpy()
+ )
+ entailment_dict = {k.lower(): v for k, v in zip(self.labels, probs)}
+ return entailment_dict
diff --git a/app_utils/frontend_utils.py b/app_utils/frontend_utils.py
index 1061615..ffb0c9b 100644
--- a/app_utils/frontend_utils.py
+++ b/app_utils/frontend_utils.py
@@ -1,16 +1,45 @@
import streamlit as st
+import pandas as pd
+
+entailment_html_messages = {
+ "entailment": 'The knowledge base seems to confirm your statement',
+ "contradiction": 'The knowledge base seems to contradict your statement',
+ "neutral": 'The knowledge base is neutral about your statement',
+}
def set_state_if_absent(key, value):
if key not in st.session_state:
st.session_state[key] = value
+
# Small callback to reset the interface in case the text of the question changes
def reset_results(*args):
st.session_state.answer = None
st.session_state.results = None
st.session_state.raw_json = None
-entailment_html_messages = {'entailment': 'The knowledge base seems to confirm your statement',
- 'contradiction': 'The knowledge base seems to contradict your statement',
- 'neutral': 'The knowledge base is neutral about your statement'}
+
+def highlight_cols(s):
+ coldict = {"con": "#FFA07A", "neu": "#E5E4E2", "ent": "#a9d39e"}
+ if s.name in coldict.keys():
+ return ["background-color: {}".format(coldict[s.name])] * len(s)
+ return [""] * len(s)
+
+
+def create_df_for_relevant_snippets(docs):
+ rows = []
+ urls = {}
+ for doc in docs:
+ row = {
+ "Title": doc.meta["name"],
+ "Relevance": f"{doc.score:.3f}",
+ "con": f"{doc.meta['entailment_info']['contradiction']:.2f}",
+ "neu": f"{doc.meta['entailment_info']['neutral']:.2f}",
+ "ent": f"{doc.meta['entailment_info']['entailment']:.2f}",
+ "Content": doc.content,
+ }
+ urls[doc.meta["name"]] = doc.meta["url"]
+ rows.append(row)
+ df = pd.DataFrame(rows).style.apply(highlight_cols)
+ return df, urls
diff --git a/data/statements.txt b/data/statements.txt
index ed7ae16..4a83fa6 100644
--- a/data/statements.txt
+++ b/data/statements.txt
@@ -17,4 +17,10 @@ Steve Vai collaborated with Frank Zappa
The White Stripes were a trio
The White Stripes were composed by Jack White and Meg White
Scorpions is a German trap band
-Sepultura is a heavy metal band
\ No newline at end of file
+Sepultura is a heavy metal band
+Toxicity is a song by System of a Down
+System of a Down is an Italian band
+The Cure is a pop band
+Mick Jagger loves pasta
+Ozzy Osbourne was part of Black Sabbath
+Zucchero is an international artist
\ No newline at end of file
diff --git a/notebooks/get_wikipedia_data.ipynb b/notebooks/get_wikipedia_data.ipynb
index bff4753..8241b26 100644
--- a/notebooks/get_wikipedia_data.ipynb
+++ b/notebooks/get_wikipedia_data.ipynb
@@ -1 +1,582 @@
-{"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"name":"python","version":"3.7.12","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"markdown","source":"# Download data from Wikipedia","metadata":{}},{"cell_type":"code","source":"# install wikipedia API python wrapper\n! pip install wikipedia","metadata":{"_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","execution":{"iopub.status.busy":"2022-08-20T21:43:59.293655Z","iopub.execute_input":"2022-08-20T21:43:59.294792Z","iopub.status.idle":"2022-08-20T21:44:15.263363Z","shell.execute_reply.started":"2022-08-20T21:43:59.294746Z","shell.execute_reply":"2022-08-20T21:44:15.262171Z"},"trusted":true},"execution_count":3,"outputs":[]},{"cell_type":"code","source":"import wikipedia\nimport json\nimport traceback","metadata":{"execution":{"iopub.status.busy":"2022-08-20T21:44:15.265341Z","iopub.execute_input":"2022-08-20T21:44:15.265753Z","iopub.status.idle":"2022-08-20T21:44:15.470330Z","shell.execute_reply.started":"2022-08-20T21:44:15.265709Z","shell.execute_reply":"2022-08-20T21:44:15.468665Z"},"trusted":true},"execution_count":4,"outputs":[]},{"cell_type":"code","source":"# titles to download, from 
https://en.wikipedia.org/wiki/List_of_mainstream_rock_performers\n\npages_titles=\"\"\"10cc\n10_Years_(band)\n3_Doors_Down\n311_(band)\n38_Special_(band)\nAccept_(band)\nAC/DC\nBryan_Adams\nAerosmith\nAFI_(band)\nAir_Supply\nThe_Alan_Parsons_Project\nAlice_in_Chains\nThe_All-American_Rejects\nThe_Allman_Brothers_Band\nAlter_Bridge\nAmbrosia_(band)\nAmerica_(band)\nThe_Animals\nAdam_Ant\nAnthrax_(American_band)\nApril_Wine\nArcade_Fire\nArctic_Monkeys\nAsia_(band)\nAudioslave\nAvenged_Sevenfold\nAwolnation\nThe_B-52's\nBachman–Turner_Overdrive\nBad_Company\nBadfinger\nThe_Band\nThe_Bangles\nBarenaked_Ladies\nBay_City_Rollers\nThe_Beach_Boys\nThe_Beatles\nBeck\nBen_Folds_Five\nPat_Benatar\nChuck_Berry\nThe_Big_Bopper\nBilly_Talent\nThe_Black_Crowes\nThe_Black_Keys\nBlack_Sabbath\nBlack_Stone_Cherry\nBlack_Veil_Brides\nBlink-182\nBloodhound_Gang\nBlue_October\nBlue_Öyster_Cult\nBlues_Traveler\nJames_Blunt\nBlur_(band)\nBon_Jovi\nBoston_(band)\nDavid_Bowie\nBowling_for_Soup\nBoys_Like_Girls\nBread_(band)\nBreaking_Benjamin\nBring_Me_the_Horizon\nJackson_Browne\nBuckcherry\nJeff_Buckley\nBullet_for_My_Valentine\nBush_(British_band)\nThe_Byrds\nCage_the_Elephant\nCake_(band)\nCanned_Heat\nThe_Cab\nThe_Cardigans\nThe_Cars\nCatfish_and_the_Bottlemen\nHarry_Chapin\nTracy_Chapman\nCheap_Trick\nChevelle_(band)\nChicago_(band)\nChubby_Checker\nCinderella_(band)\nDallas_Green_(musician)\nEric_Clapton\nThe_Clash\nEddie_Cochran\nJoe_Cocker\nCoheed_and_Cambria\nCold_Chisel\nColdplay\nCollective_Soul\nPhil_Collins\nAlice_Cooper\nChris_Cornell\nElvis_Costello\nCounting_Crows\nThe_Cranberries\nCrash_Test_Dummies\nCream_(band)\nCreed_(band)\nCreedence_Clearwater_Revival\nJim_Croce\nCrosby,_Stills,_Nash_&_Young\nChristopher_Cross\nSheryl_Crow\nCrowded_House\nThe_Cult\nThe_Cure\nDamn_Yankees_(band)\nDashboard_Confessional\nDaughtry_(band)\nThe_Dave_Clark_Five\nDave_Matthews_Band\nDays_of_the_New\nDeath_Cab_for_Cutie\nDeep_Purple\nDef_Leppard\nDeftones\nDepeche_Mode\nBo_Diddley\nDio_(band
)\nDire_Straits\nDisturbed_(band)\nFats_Domino\nDonovan\nThe_Doobie_Brothers\nThe_Doors\nDr._Hook_&_the_Medicine_Show\nDropkick_Murphys\nDrowning_Pool\nDuran_Duran\nIan_Dury\nBob_Dylan\nEagles_(band)\nEcho_&_the_Bunnymen\nDuane_Eddy\nEdgar_Winter\nElectric_Light_Orchestra\nEmerson,_Lake_&_Palmer\nEngland_Dan_&_John_Ford_Coley\nMelissa_Etheridge\nEurope_(band)\nEvanescence\nEverclear_(band)\nEverlast\nThe_Everly_Brothers\nExtreme_(band)\nFaces_(band)\nFaith_No_More\nFall_Out_Boy\nBryan_Ferry\nFilter_(band)\nFinger_Eleven\nFireHouse\nFive_Finger_Death_Punch\nFive_for_Fighting\nThe_Fixx\nThe_Flaming_Lips\nFleetwood_Mac\nFlogging_Molly\nFlorence_and_the_Machine\nFlyleaf_(band)\nFoals_(band)\nDan_Fogelberg\nJohn_Fogerty\nFoo_Fighters\nForeigner_(band)\nFoster_the_People\nThe_Four_Seasons_(band)\nPeter_Frampton\nFranz_Ferdinand_(band)\nThe_Fray\nGlenn_Frey\nFuel_(band)\nFun_(band)\nPeter_Gabriel\nGarbage_(band)\nGenesis_(band)\nGhost_(Swedish_band)\nGin_Blossoms\nGary_Glitter\nThe_Go-Go's\nGodsmack\nGolden_Earring\nGoo_Goo_Dolls\nGood_Charlotte\nGrand_Funk_Railroad\nGrateful_Dead\nGreat_White\nGreen_Day\nGreta_Van_Fleet\nThe_Guess_Who\nGuns_N'_Roses\nHalestorm\nBill_Haley_&_His_Comets\nHall_&_Oates\nGeorge_Harrison\nHeart_(band)\nJimi_Hendrix\nDon_Henley\nHerman's_Hermits\nHighly_Suspect\nHinder\nThe_Hives\nHole_(band)\nThe_Hollies\nBuddy_Holly\nHoobastank\nHootie_&_the_Blowfish\nIcehouse_(band)\nBilly_Idol\nImagine_Dragons\nIncubus_(band)\nInterpol_(band)\nINXS\nIron_Maiden\nThe_J._Geils_Band\nThe_Jam\nTommy_James_and_the_Shondells\nJane's_Addiction\nJefferson_Airplane\nJefferson_Starship\nThe_Jesus_and_Mary_Chain\nJet_(Australian_band)\nJethro_Tull_(band)\nJoan_Jett\nJimmy_Eat_World\nBilly_Joel\nElton_John\nJanis_Joplin\nJourney_(band)\nJoy_Division\nJudas_Priest\nKaiser_Chiefs\nKaleo_(band)\nKansas_(band)\nKeane_(band)\nKid_Rock\nThe_Killers\nKillswitch_Engage\nKings_of_Leon\nThe_Kinks\nKiss_(band)\nKorn\nLenny_Kravitz\nLacuna_Coil\nLamb_of_God_(band)\nAvril_Lavigne\nL
ed_Zeppelin\nJohn_Lennon\nHuey_Lewis_and_the_News\nJerry_Lee_Lewis\nLifehouse_(band)\nLimp_Bizkit\nLinkin_Park\nLittle_Richard\nLittle_River_Band\nLive_(band)\nLiving_Colour\nKenny_Loggins\nLoverboy\nThe_Lovin'_Spoonful\nThe_Lumineers\nLynyrd_Skynyrd\nThe_Mamas_&_the_Papas\nMarilyn_Manson\nThe_Marshall_Tucker_Band\nMatchbox_Twenty\nJohn_Mayer\nPaul_McCartney\nMeat_Loaf\nMegadeth\nJohn_Mellencamp\nMen_at_Work\nMetallica\nMidnight_Oil\nMike_and_the_Mechanics\nModest_Mouse\nEddie_Money\nThe_Monkees\nThe_Moody_Blues\nAlanis_Morissette\nVan_Morrison\nMorrissey\nMötley_Crüe\nMotörhead\nMudvayne\nMumford_&_Sons\nMuse_(band)\nMy_Chemical_Romance\nNickelback\nStevie_Nicks\nHarry_Nilsson\nNine_Inch_Nails\nNirvana_(band)\nNo_Doubt\nTed_Nugent\nOasis_(band)\nThe_Offspring\nRoy_Orbison\nOzzy_Osbourne\nOur_Lady_Peace\nThe_Outfield\nP.O.D.\nPanic!_at_the_Disco\nPantera\nPapa_Roach\nParamore\nPearl_Jam\nA_Perfect_Circle\nTom_Petty_and_the_Heartbreakers\nPink_Floyd\nPixies_(band)\nRobert_Plant\nPoison_(American_band)\nThe_Police\nIggy_Pop\nPop_Evil\nThe_Presidents_of_the_United_States_of_America_(band)\nThe_Pretenders\nElvis_Presley\nThe_Pretty_Reckless\nPrimus_(band)\nPuddle_of_Mudd\nQueen_(band)\nQueens_of_the_Stone_Age\nQueensrÿche\nQuiet_Riot\nR.E.M.\nRadiohead\nRage_Against_the_Machine\nRainbow_(rock_band)\nRammstein\nRamones\nRed_Hot_Chili_Peppers\nLou_Reed\nREO_Speedwagon\nRise_Against\nThe_Rolling_Stones\nLinda_Ronstadt\nRoxy_Music\nRoyal_Blood_(band)\nRush_(band)\nSaliva_(band)\nSam_Fender\nSantana_(band)\nJoe_Satriani\nSaving_Abel\nScorpions_(band)\nThe_Script\nSeether\nBob_Seger\nSepultura\nSex_Pistols\nShakin'_Stevens\nShinedown\nSilverchair\nSimon_&_Garfunkel\nSimple_Minds\nSimple_Plan\nSkid_Row_(American_band)\nSkillet_(band)\nSlade\nSlayer\nSlipknot_(band)\nSmall_Faces\nSmash_Mouth\nThe_Smashing_Pumpkins\nThe_Smiths\nSmokie_(band)\nSnow_Patrol\nSocial_Distortion\nSoundgarden\nBruce_Springsteen\nBilly_Squier\nStaind\nRingo_Starr\nStarset\nStarship_(band)\nStatus_Quo_(b
and)\nSteely_Dan\nSteppenwolf_(band)\nSteve_Miller_Band\nRod_Stewart\nSting_(musician)\nThe_Stone_Roses\nStone_Sour\nStone_Temple_Pilots\nThe_Strokes\nStyx_(band)\nSublime_(band)\nSum_41\nSupertramp\nSurvivor_(band)\nThe_Sweet\nSystem_of_a_Down\nT._Rex_(band)\nTalking_Heads\nJames_Taylor\nTenacious_D\nTesla_(band)\nTheory_of_a_Deadman\nThin_Lizzy\nThird_Eye_Blind\nThirty_Seconds_to_Mars\nGeorge_Thorogood\nThousand_Foot_Krutch\nThree_Days_Grace\nThree_Dog_Night\nTool_(band)\nToto_(band)\nTraffic_(band)\nThe_Tragically_Hip\nTrain_(band)\nTraveling_Wilburys\nTravis_(band)\nTrivium_(band)\nTwenty_One_Pilots\nTwisted_Sister\nU2\nUriah_Heep_(band)\nThe_Used\nSteve_Vai\nRitchie_Valens\nVampire_Weekend\nVan_Halen\nStevie_Ray_Vaughan\nVelvet_Revolver\nThe_Velvet_Underground\nThe_Verve\nVolbeat\nJoe_Walsh\nWarrant_(American_band)\nWeezer\nJack_White\nThe_White_Stripes\nWhite_Zombie_(band)\nWhitesnake\nThe_Who\nPaul_McCartney_and_Wings\nSteve_Winwood\nThe_Yardbirds\nYes_(band)\nNeil_Young\nFrank_Zappa\nRob_Zombie\nThe_Zombies\nZZ_Top\"\"\".split('\\n')","metadata":{"execution":{"iopub.status.busy":"2022-08-20T23:34:24.681697Z","iopub.execute_input":"2022-08-20T23:34:24.682223Z","iopub.status.idle":"2022-08-20T23:34:24.693942Z","shell.execute_reply.started":"2022-08-20T23:34:24.682178Z","shell.execute_reply":"2022-08-20T23:34:24.693004Z"},"trusted":true},"execution_count":54,"outputs":[]},{"cell_type":"code","source":"for i,raw_title in enumerate(pages_titles):\n if i%10==0:\n print(i/len(pages_titles)*100)\n try:\n page=wikipedia.page(title=raw_title.replace('_', ' '), auto_suggest=False)\n id_ = page.pageid\n url= page.url\n dic={'content': page.content,\n 'meta':{'name': page.title,\n 'url': url}}\n\n \n with open(f'/kaggle/working/rock_wiki/{id_}.json','w') as fo:\n json.dump(dic, fo)\n except Exception as e:\n traceback.print_exc()\n print(raw_title)\n 
","metadata":{"execution":{"iopub.status.busy":"2022-08-20T23:34:49.157641Z","iopub.execute_input":"2022-08-20T23:34:49.158086Z","iopub.status.idle":"2022-08-20T23:44:29.346317Z","shell.execute_reply.started":"2022-08-20T23:34:49.158047Z","shell.execute_reply":"2022-08-20T23:44:29.345032Z"},"trusted":true},"execution_count":57,"outputs":[]},{"cell_type":"code","source":"! tar -czvf rock_wiki.tar.gz ./rock_wiki","metadata":{"execution":{"iopub.status.busy":"2022-08-20T23:50:44.643851Z","iopub.execute_input":"2022-08-20T23:50:44.644378Z","iopub.status.idle":"2022-08-20T23:50:44.650366Z","shell.execute_reply.started":"2022-08-20T23:50:44.644328Z","shell.execute_reply":"2022-08-20T23:50:44.649169Z"},"trusted":true},"execution_count":60,"outputs":[]}]}
\ No newline at end of file
+{
+ "metadata": {
+ "kernelspec": {
+ "language": "python",
+ "display_name": "Python 3",
+ "name": "python3"
+ },
+ "language_info": {
+ "name": "python",
+ "version": "3.7.12",
+ "mimetype": "text/x-python",
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "pygments_lexer": "ipython3",
+ "nbconvert_exporter": "python",
+ "file_extension": ".py"
+ }
+ },
+ "nbformat_minor": 4,
+ "nbformat": 4,
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "source": "# Download data from Wikipedia",
+ "metadata": {}
+ },
+ {
+ "cell_type": "code",
+ "source": "# install wikipedia API python wrapper\n! pip install wikipedia",
+ "metadata": {
+ "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5",
+ "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19",
+ "execution": {
+ "iopub.status.busy": "2022-08-20T21:43:59.293655Z",
+ "iopub.execute_input": "2022-08-20T21:43:59.294792Z",
+ "iopub.status.idle": "2022-08-20T21:44:15.263363Z",
+ "shell.execute_reply.started": "2022-08-20T21:43:59.294746Z",
+ "shell.execute_reply": "2022-08-20T21:44:15.262171Z"
+ },
+ "trusted": true
+ },
+ "execution_count": 3,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": "import wikipedia\nimport json\nimport traceback",
+ "metadata": {
+ "execution": {
+ "iopub.status.busy": "2022-08-20T21:44:15.265341Z",
+ "iopub.execute_input": "2022-08-20T21:44:15.265753Z",
+ "iopub.status.idle": "2022-08-20T21:44:15.470330Z",
+ "shell.execute_reply.started": "2022-08-20T21:44:15.265709Z",
+ "shell.execute_reply": "2022-08-20T21:44:15.468665Z"
+ },
+ "trusted": true
+ },
+ "execution_count": 4,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# titles to download, from https://en.wikipedia.org/wiki/List_of_mainstream_rock_performers\n",
+ "\n",
+ "pages_titles = \"\"\"10cc\n",
+ "10_Years_(band)\n",
+ "3_Doors_Down\n",
+ "311_(band)\n",
+ "38_Special_(band)\n",
+ "Accept_(band)\n",
+ "AC/DC\n",
+ "Bryan_Adams\n",
+ "Aerosmith\n",
+ "AFI_(band)\n",
+ "Air_Supply\n",
+ "The_Alan_Parsons_Project\n",
+ "Alice_in_Chains\n",
+ "The_All-American_Rejects\n",
+ "The_Allman_Brothers_Band\n",
+ "Alter_Bridge\n",
+ "Ambrosia_(band)\n",
+ "America_(band)\n",
+ "The_Animals\n",
+ "Adam_Ant\n",
+ "Anthrax_(American_band)\n",
+ "April_Wine\n",
+ "Arcade_Fire\n",
+ "Arctic_Monkeys\n",
+ "Asia_(band)\n",
+ "Audioslave\n",
+ "Avenged_Sevenfold\n",
+ "Awolnation\n",
+ "The_B-52's\n",
+ "Bachman–Turner_Overdrive\n",
+ "Bad_Company\n",
+ "Badfinger\n",
+ "The_Band\n",
+ "The_Bangles\n",
+ "Barenaked_Ladies\n",
+ "Bay_City_Rollers\n",
+ "The_Beach_Boys\n",
+ "The_Beatles\n",
+ "Beck\n",
+ "Ben_Folds_Five\n",
+ "Pat_Benatar\n",
+ "Chuck_Berry\n",
+ "The_Big_Bopper\n",
+ "Billy_Talent\n",
+ "The_Black_Crowes\n",
+ "The_Black_Keys\n",
+ "Black_Sabbath\n",
+ "Black_Stone_Cherry\n",
+ "Black_Veil_Brides\n",
+ "Blink-182\n",
+ "Bloodhound_Gang\n",
+ "Blue_October\n",
+ "Blue_Öyster_Cult\n",
+ "Blues_Traveler\n",
+ "James_Blunt\n",
+ "Blur_(band)\n",
+ "Bon_Jovi\n",
+ "Boston_(band)\n",
+ "David_Bowie\n",
+ "Bowling_for_Soup\n",
+ "Boys_Like_Girls\n",
+ "Bread_(band)\n",
+ "Breaking_Benjamin\n",
+ "Bring_Me_the_Horizon\n",
+ "Jackson_Browne\n",
+ "Buckcherry\n",
+ "Jeff_Buckley\n",
+ "Bullet_for_My_Valentine\n",
+ "Bush_(British_band)\n",
+ "The_Byrds\n",
+ "Cage_the_Elephant\n",
+ "Cake_(band)\n",
+ "Canned_Heat\n",
+ "The_Cab\n",
+ "The_Cardigans\n",
+ "The_Cars\n",
+ "Catfish_and_the_Bottlemen\n",
+ "Harry_Chapin\n",
+ "Tracy_Chapman\n",
+ "Cheap_Trick\n",
+ "Chevelle_(band)\n",
+ "Chicago_(band)\n",
+ "Chubby_Checker\n",
+ "Cinderella_(band)\n",
+ "Dallas_Green_(musician)\n",
+ "Eric_Clapton\n",
+ "The_Clash\n",
+ "Eddie_Cochran\n",
+ "Joe_Cocker\n",
+ "Coheed_and_Cambria\n",
+ "Cold_Chisel\n",
+ "Coldplay\n",
+ "Collective_Soul\n",
+ "Phil_Collins\n",
+ "Alice_Cooper\n",
+ "Chris_Cornell\n",
+ "Elvis_Costello\n",
+ "Counting_Crows\n",
+ "The_Cranberries\n",
+ "Crash_Test_Dummies\n",
+ "Cream_(band)\n",
+ "Creed_(band)\n",
+ "Creedence_Clearwater_Revival\n",
+ "Jim_Croce\n",
+ "Crosby,_Stills,_Nash_&_Young\n",
+ "Christopher_Cross\n",
+ "Sheryl_Crow\n",
+ "Crowded_House\n",
+ "The_Cult\n",
+ "The_Cure\n",
+ "Damn_Yankees_(band)\n",
+ "Dashboard_Confessional\n",
+ "Daughtry_(band)\n",
+ "The_Dave_Clark_Five\n",
+ "Dave_Matthews_Band\n",
+ "Days_of_the_New\n",
+ "Death_Cab_for_Cutie\n",
+ "Deep_Purple\n",
+ "Def_Leppard\n",
+ "Deftones\n",
+ "Depeche_Mode\n",
+ "Bo_Diddley\n",
+ "Dio_(band)\n",
+ "Dire_Straits\n",
+ "Disturbed_(band)\n",
+ "Fats_Domino\n",
+ "Donovan\n",
+ "The_Doobie_Brothers\n",
+ "The_Doors\n",
+ "Dr._Hook_&_the_Medicine_Show\n",
+ "Dropkick_Murphys\n",
+ "Drowning_Pool\n",
+ "Duran_Duran\n",
+ "Ian_Dury\n",
+ "Bob_Dylan\n",
+ "Eagles_(band)\n",
+ "Echo_&_the_Bunnymen\n",
+ "Duane_Eddy\n",
+ "Edgar_Winter\n",
+ "Electric_Light_Orchestra\n",
+ "Emerson,_Lake_&_Palmer\n",
+ "England_Dan_&_John_Ford_Coley\n",
+ "Melissa_Etheridge\n",
+ "Europe_(band)\n",
+ "Evanescence\n",
+ "Everclear_(band)\n",
+ "Everlast\n",
+ "The_Everly_Brothers\n",
+ "Extreme_(band)\n",
+ "Faces_(band)\n",
+ "Faith_No_More\n",
+ "Fall_Out_Boy\n",
+ "Bryan_Ferry\n",
+ "Filter_(band)\n",
+ "Finger_Eleven\n",
+ "FireHouse\n",
+ "Five_Finger_Death_Punch\n",
+ "Five_for_Fighting\n",
+ "The_Fixx\n",
+ "The_Flaming_Lips\n",
+ "Fleetwood_Mac\n",
+ "Flogging_Molly\n",
+ "Florence_and_the_Machine\n",
+ "Flyleaf_(band)\n",
+ "Foals_(band)\n",
+ "Dan_Fogelberg\n",
+ "John_Fogerty\n",
+ "Foo_Fighters\n",
+ "Foreigner_(band)\n",
+ "Foster_the_People\n",
+ "The_Four_Seasons_(band)\n",
+ "Peter_Frampton\n",
+ "Franz_Ferdinand_(band)\n",
+ "The_Fray\n",
+ "Glenn_Frey\n",
+ "Fuel_(band)\n",
+ "Fun_(band)\n",
+ "Peter_Gabriel\n",
+ "Garbage_(band)\n",
+ "Genesis_(band)\n",
+ "Ghost_(Swedish_band)\n",
+ "Gin_Blossoms\n",
+ "Gary_Glitter\n",
+ "The_Go-Go's\n",
+ "Godsmack\n",
+ "Golden_Earring\n",
+ "Goo_Goo_Dolls\n",
+ "Good_Charlotte\n",
+ "Grand_Funk_Railroad\n",
+ "Grateful_Dead\n",
+ "Great_White\n",
+ "Green_Day\n",
+ "Greta_Van_Fleet\n",
+ "The_Guess_Who\n",
+ "Guns_N'_Roses\n",
+ "Halestorm\n",
+ "Bill_Haley_&_His_Comets\n",
+ "Hall_&_Oates\n",
+ "George_Harrison\n",
+ "Heart_(band)\n",
+ "Jimi_Hendrix\n",
+ "Don_Henley\n",
+ "Herman's_Hermits\n",
+ "Highly_Suspect\n",
+ "Hinder\n",
+ "The_Hives\n",
+ "Hole_(band)\n",
+ "The_Hollies\n",
+ "Buddy_Holly\n",
+ "Hoobastank\n",
+ "Hootie_&_the_Blowfish\n",
+ "Icehouse_(band)\n",
+ "Billy_Idol\n",
+ "Imagine_Dragons\n",
+ "Incubus_(band)\n",
+ "Interpol_(band)\n",
+ "INXS\n",
+ "Iron_Maiden\n",
+ "The_J._Geils_Band\n",
+ "The_Jam\n",
+ "Tommy_James_and_the_Shondells\n",
+ "Jane's_Addiction\n",
+ "Jefferson_Airplane\n",
+ "Jefferson_Starship\n",
+ "The_Jesus_and_Mary_Chain\n",
+ "Jet_(Australian_band)\n",
+ "Jethro_Tull_(band)\n",
+ "Joan_Jett\n",
+ "Jimmy_Eat_World\n",
+ "Billy_Joel\n",
+ "Elton_John\n",
+ "Janis_Joplin\n",
+ "Journey_(band)\n",
+ "Joy_Division\n",
+ "Judas_Priest\n",
+ "Kaiser_Chiefs\n",
+ "Kaleo_(band)\n",
+ "Kansas_(band)\n",
+ "Keane_(band)\n",
+ "Kid_Rock\n",
+ "The_Killers\n",
+ "Killswitch_Engage\n",
+ "Kings_of_Leon\n",
+ "The_Kinks\n",
+ "Kiss_(band)\n",
+ "Korn\n",
+ "Lenny_Kravitz\n",
+ "Lacuna_Coil\n",
+ "Lamb_of_God_(band)\n",
+ "Avril_Lavigne\n",
+ "Led_Zeppelin\n",
+ "John_Lennon\n",
+ "Huey_Lewis_and_the_News\n",
+ "Jerry_Lee_Lewis\n",
+ "Lifehouse_(band)\n",
+ "Limp_Bizkit\n",
+ "Linkin_Park\n",
+ "Little_Richard\n",
+ "Little_River_Band\n",
+ "Live_(band)\n",
+ "Living_Colour\n",
+ "Kenny_Loggins\n",
+ "Loverboy\n",
+ "The_Lovin'_Spoonful\n",
+ "The_Lumineers\n",
+ "Lynyrd_Skynyrd\n",
+ "The_Mamas_&_the_Papas\n",
+ "Marilyn_Manson\n",
+ "The_Marshall_Tucker_Band\n",
+ "Matchbox_Twenty\n",
+ "John_Mayer\n",
+ "Paul_McCartney\n",
+ "Meat_Loaf\n",
+ "Megadeth\n",
+ "John_Mellencamp\n",
+ "Men_at_Work\n",
+ "Metallica\n",
+ "Midnight_Oil\n",
+ "Mike_and_the_Mechanics\n",
+ "Modest_Mouse\n",
+ "Eddie_Money\n",
+ "The_Monkees\n",
+ "The_Moody_Blues\n",
+ "Alanis_Morissette\n",
+ "Van_Morrison\n",
+ "Morrissey\n",
+ "Mötley_Crüe\n",
+ "Motörhead\n",
+ "Mudvayne\n",
+ "Mumford_&_Sons\n",
+ "Muse_(band)\n",
+ "My_Chemical_Romance\n",
+ "Nickelback\n",
+ "Stevie_Nicks\n",
+ "Harry_Nilsson\n",
+ "Nine_Inch_Nails\n",
+ "Nirvana_(band)\n",
+ "No_Doubt\n",
+ "Ted_Nugent\n",
+ "Oasis_(band)\n",
+ "The_Offspring\n",
+ "Roy_Orbison\n",
+ "Ozzy_Osbourne\n",
+ "Our_Lady_Peace\n",
+ "The_Outfield\n",
+ "P.O.D.\n",
+ "Panic!_at_the_Disco\n",
+ "Pantera\n",
+ "Papa_Roach\n",
+ "Paramore\n",
+ "Pearl_Jam\n",
+ "A_Perfect_Circle\n",
+ "Tom_Petty_and_the_Heartbreakers\n",
+ "Pink_Floyd\n",
+ "Pixies_(band)\n",
+ "Robert_Plant\n",
+ "Poison_(American_band)\n",
+ "The_Police\n",
+ "Iggy_Pop\n",
+ "Pop_Evil\n",
+ "The_Presidents_of_the_United_States_of_America_(band)\n",
+ "The_Pretenders\n",
+ "Elvis_Presley\n",
+ "The_Pretty_Reckless\n",
+ "Primus_(band)\n",
+ "Puddle_of_Mudd\n",
+ "Queen_(band)\n",
+ "Queens_of_the_Stone_Age\n",
+ "Queensrÿche\n",
+ "Quiet_Riot\n",
+ "R.E.M.\n",
+ "Radiohead\n",
+ "Rage_Against_the_Machine\n",
+ "Rainbow_(rock_band)\n",
+ "Rammstein\n",
+ "Ramones\n",
+ "Red_Hot_Chili_Peppers\n",
+ "Lou_Reed\n",
+ "REO_Speedwagon\n",
+ "Rise_Against\n",
+ "The_Rolling_Stones\n",
+ "Linda_Ronstadt\n",
+ "Roxy_Music\n",
+ "Royal_Blood_(band)\n",
+ "Rush_(band)\n",
+ "Saliva_(band)\n",
+ "Sam_Fender\n",
+ "Santana_(band)\n",
+ "Joe_Satriani\n",
+ "Saving_Abel\n",
+ "Scorpions_(band)\n",
+ "The_Script\n",
+ "Seether\n",
+ "Bob_Seger\n",
+ "Sepultura\n",
+ "Sex_Pistols\n",
+ "Shakin'_Stevens\n",
+ "Shinedown\n",
+ "Silverchair\n",
+ "Simon_&_Garfunkel\n",
+ "Simple_Minds\n",
+ "Simple_Plan\n",
+ "Skid_Row_(American_band)\n",
+ "Skillet_(band)\n",
+ "Slade\n",
+ "Slayer\n",
+ "Slipknot_(band)\n",
+ "Small_Faces\n",
+ "Smash_Mouth\n",
+ "The_Smashing_Pumpkins\n",
+ "The_Smiths\n",
+ "Smokie_(band)\n",
+ "Snow_Patrol\n",
+ "Social_Distortion\n",
+ "Soundgarden\n",
+ "Bruce_Springsteen\n",
+ "Billy_Squier\n",
+ "Staind\n",
+ "Ringo_Starr\n",
+ "Starset\n",
+ "Starship_(band)\n",
+ "Status_Quo_(band)\n",
+ "Steely_Dan\n",
+ "Steppenwolf_(band)\n",
+ "Steve_Miller_Band\n",
+ "Rod_Stewart\n",
+ "Sting_(musician)\n",
+ "The_Stone_Roses\n",
+ "Stone_Sour\n",
+ "Stone_Temple_Pilots\n",
+ "The_Strokes\n",
+ "Styx_(band)\n",
+ "Sublime_(band)\n",
+ "Sum_41\n",
+ "Supertramp\n",
+ "Survivor_(band)\n",
+ "The_Sweet\n",
+ "System_of_a_Down\n",
+ "T._Rex_(band)\n",
+ "Talking_Heads\n",
+ "James_Taylor\n",
+ "Tenacious_D\n",
+ "Tesla_(band)\n",
+ "Theory_of_a_Deadman\n",
+ "Thin_Lizzy\n",
+ "Third_Eye_Blind\n",
+ "Thirty_Seconds_to_Mars\n",
+ "George_Thorogood\n",
+ "Thousand_Foot_Krutch\n",
+ "Three_Days_Grace\n",
+ "Three_Dog_Night\n",
+ "Tool_(band)\n",
+ "Toto_(band)\n",
+ "Traffic_(band)\n",
+ "The_Tragically_Hip\n",
+ "Train_(band)\n",
+ "Traveling_Wilburys\n",
+ "Travis_(band)\n",
+ "Trivium_(band)\n",
+ "Twenty_One_Pilots\n",
+ "Twisted_Sister\n",
+ "U2\n",
+ "Uriah_Heep_(band)\n",
+ "The_Used\n",
+ "Steve_Vai\n",
+ "Ritchie_Valens\n",
+ "Vampire_Weekend\n",
+ "Van_Halen\n",
+ "Stevie_Ray_Vaughan\n",
+ "Velvet_Revolver\n",
+ "The_Velvet_Underground\n",
+ "The_Verve\n",
+ "Volbeat\n",
+ "Joe_Walsh\n",
+ "Warrant_(American_band)\n",
+ "Weezer\n",
+ "Jack_White\n",
+ "The_White_Stripes\n",
+ "White_Zombie_(band)\n",
+ "Whitesnake\n",
+ "The_Who\n",
+ "Paul_McCartney_and_Wings\n",
+ "Steve_Winwood\n",
+ "The_Yardbirds\n",
+ "Yes_(band)\n",
+ "Neil_Young\n",
+ "Frank_Zappa\n",
+ "Rob_Zombie\n",
+ "The_Zombies\n",
+ "ZZ_Top\"\"\".split(\n",
+ " \"\\n\"\n",
+ ")"
+ ],
+ "metadata": {
+ "execution": {
+ "iopub.status.busy": "2022-08-20T23:34:24.681697Z",
+ "iopub.execute_input": "2022-08-20T23:34:24.682223Z",
+ "iopub.status.idle": "2022-08-20T23:34:24.693942Z",
+ "shell.execute_reply.started": "2022-08-20T23:34:24.682178Z",
+ "shell.execute_reply": "2022-08-20T23:34:24.693004Z"
+ },
+ "trusted": true
+ },
+ "execution_count": 54,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "for i, raw_title in enumerate(pages_titles):\n",
+ " if i % 10 == 0:\n",
+ " print(i / len(pages_titles) * 100)\n",
+ " try:\n",
+ " page = wikipedia.page(title=raw_title.replace(\"_\", \" \"), auto_suggest=False)\n",
+ " id_ = page.pageid\n",
+ " url = page.url\n",
+ " dic = {\"content\": page.content, \"meta\": {\"name\": page.title, \"url\": url}}\n",
+ "\n",
+ " with open(f\"/kaggle/working/rock_wiki/{id_}.json\", \"w\") as fo:\n",
+ " json.dump(dic, fo)\n",
+ " except Exception as e:\n",
+ " traceback.print_exc()\n",
+ " print(raw_title)"
+ ],
+ "metadata": {
+ "execution": {
+ "iopub.status.busy": "2022-08-20T23:34:49.157641Z",
+ "iopub.execute_input": "2022-08-20T23:34:49.158086Z",
+ "iopub.status.idle": "2022-08-20T23:44:29.346317Z",
+ "shell.execute_reply.started": "2022-08-20T23:34:49.158047Z",
+ "shell.execute_reply": "2022-08-20T23:44:29.345032Z"
+ },
+ "trusted": true
+ },
+ "execution_count": 57,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": "! tar -czvf rock_wiki.tar.gz ./rock_wiki",
+ "metadata": {
+ "execution": {
+ "iopub.status.busy": "2022-08-20T23:50:44.643851Z",
+ "iopub.execute_input": "2022-08-20T23:50:44.644378Z",
+ "iopub.status.idle": "2022-08-20T23:50:44.650366Z",
+ "shell.execute_reply.started": "2022-08-20T23:50:44.644328Z",
+ "shell.execute_reply": "2022-08-20T23:50:44.649169Z"
+ },
+ "trusted": true
+ },
+ "execution_count": 60,
+ "outputs": []
+ }
+ ]
+}
\ No newline at end of file
diff --git a/notebooks/indexing.ipynb b/notebooks/indexing.ipynb
index b5918f5..24fa2d5 100644
--- a/notebooks/indexing.ipynb
+++ b/notebooks/indexing.ipynb
@@ -1 +1,417 @@
-{"cells":[{"cell_type":"markdown","metadata":{},"source":["# Indexing\n","Using [Haystack](https://github.com/deepset-ai/haystack), the following steps are performed:\n","- load and preprocess documents downloaded from Wikipedia\n","- create document store and write documents\n","- initialize retriever and generate document embeddings"]},{"cell_type":"code","execution_count":null,"metadata":{"_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","trusted":true},"outputs":[],"source":["! pip install farm-haystack[faiss-gpu]==1.7.0"]},{"cell_type":"markdown","metadata":{},"source":["## Load documents"]},{"cell_type":"code","execution_count":2,"metadata":{"execution":{"iopub.execute_input":"2022-08-21T08:23:23.692554Z","iopub.status.busy":"2022-08-21T08:23:23.692208Z","iopub.status.idle":"2022-08-21T08:23:23.700721Z","shell.execute_reply":"2022-08-21T08:23:23.698130Z","shell.execute_reply.started":"2022-08-21T08:23:23.692512Z"},"trusted":true},"outputs":[],"source":["import glob, json"]},{"cell_type":"code","execution_count":3,"metadata":{"execution":{"iopub.execute_input":"2022-08-21T08:23:23.707774Z","iopub.status.busy":"2022-08-21T08:23:23.704107Z","iopub.status.idle":"2022-08-21T08:23:25.026910Z","shell.execute_reply":"2022-08-21T08:23:25.025990Z","shell.execute_reply.started":"2022-08-21T08:23:23.705010Z"},"trusted":true},"outputs":[],"source":["docs=[]\n","\n","for json_file in glob.glob('../input/crawl-rock/rock_wiki/*.json'):\n"," with open(json_file, 'r') as fin:\n"," doc=json.load(fin)\n","\n"," 
docs.append(doc)\n"]},{"cell_type":"code","execution_count":4,"metadata":{"execution":{"iopub.execute_input":"2022-08-21T08:23:25.030530Z","iopub.status.busy":"2022-08-21T08:23:25.029931Z","iopub.status.idle":"2022-08-21T08:23:25.039324Z","shell.execute_reply":"2022-08-21T08:23:25.037960Z","shell.execute_reply.started":"2022-08-21T08:23:25.030491Z"},"trusted":true},"outputs":[{"data":{"text/plain":["453"]},"execution_count":4,"metadata":{},"output_type":"execute_result"}],"source":["len(docs)"]},{"cell_type":"markdown","metadata":{},"source":["## Preprocess documents"]},{"cell_type":"code","execution_count":6,"metadata":{"execution":{"iopub.execute_input":"2022-08-21T08:23:25.050479Z","iopub.status.busy":"2022-08-21T08:23:25.050099Z","iopub.status.idle":"2022-08-21T08:23:42.089083Z","shell.execute_reply":"2022-08-21T08:23:42.087929Z","shell.execute_reply.started":"2022-08-21T08:23:25.050446Z"},"trusted":true},"outputs":[{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"108e8c46426f44e7be98a8ae930d81ce","version_major":2,"version_minor":0},"text/plain":["Preprocessing: 0%| | 0/453 [00:00, ?docs/s]"]},"metadata":{},"output_type":"display_data"}],"source":["# preprocess documents, splitting by chunks of 2 sentences\n","\n","from haystack.nodes import PreProcessor\n","\n","processor = PreProcessor(\n"," clean_empty_lines=True,\n"," clean_whitespace=True,\n"," clean_header_footer=True,\n"," split_by=\"sentence\",\n"," split_length=2,\n"," split_respect_sentence_boundary=False,\n"," split_overlap=0,\n"," language ='en'\n",")\n","preprocessed_docs = 
processor.process(docs)"]},{"cell_type":"code","execution_count":7,"metadata":{"execution":{"iopub.execute_input":"2022-08-21T08:23:42.092031Z","iopub.status.busy":"2022-08-21T08:23:42.090654Z","iopub.status.idle":"2022-08-21T08:23:42.105757Z","shell.execute_reply":"2022-08-21T08:23:42.104500Z","shell.execute_reply.started":"2022-08-21T08:23:42.091989Z"},"trusted":true},"outputs":[{"data":{"text/plain":["50024"]},"execution_count":7,"metadata":{},"output_type":"execute_result"}],"source":["len(preprocessed_docs)"]},{"cell_type":"code","execution_count":8,"metadata":{"execution":{"iopub.execute_input":"2022-08-21T08:23:42.108367Z","iopub.status.busy":"2022-08-21T08:23:42.107604Z","iopub.status.idle":"2022-08-21T08:23:42.117080Z","shell.execute_reply":"2022-08-21T08:23:42.115996Z","shell.execute_reply.started":"2022-08-21T08:23:42.108271Z"},"trusted":true},"outputs":[{"data":{"text/plain":["[,\n"," ,\n"," ,\n"," ,\n"," ,\n"," ,\n"," ,\n"," ,\n"," ,\n"," ]"]},"execution_count":8,"metadata":{},"output_type":"execute_result"}],"source":["preprocessed_docs[:10]"]},{"cell_type":"markdown","metadata":{},"source":["## Create document store ([FAISS](https://github.com/facebookresearch/faiss)) and write documents"]},{"cell_type":"code","execution_count":9,"metadata":{"execution":{"iopub.execute_input":"2022-08-21T08:23:42.119585Z","iopub.status.busy":"2022-08-21T08:23:42.118544Z","iopub.status.idle":"2022-08-21T08:23:42.124669Z","shell.execute_reply":"2022-08-21T08:23:42.123597Z","shell.execute_reply.started":"2022-08-21T08:23:42.119551Z"},"trusted":true},"outputs":[],"source":["from haystack.document_stores import FAISSDocumentStore\n","from haystack.nodes import 
EmbeddingRetriever"]},{"cell_type":"code","execution_count":10,"metadata":{"execution":{"iopub.execute_input":"2022-08-21T08:23:42.129562Z","iopub.status.busy":"2022-08-21T08:23:42.128772Z","iopub.status.idle":"2022-08-21T08:23:42.259879Z","shell.execute_reply":"2022-08-21T08:23:42.258950Z","shell.execute_reply.started":"2022-08-21T08:23:42.129518Z"},"trusted":true},"outputs":[],"source":["# the document store settings are those compatible with Embedding Retriever\n","document_store = FAISSDocumentStore(\n"," similarity=\"dot_product\",\n"," embedding_dim=768)"]},{"cell_type":"code","execution_count":46,"metadata":{"execution":{"iopub.execute_input":"2022-08-21T08:43:25.952230Z","iopub.status.busy":"2022-08-21T08:43:25.951856Z","iopub.status.idle":"2022-08-21T08:46:12.506842Z","shell.execute_reply":"2022-08-21T08:46:12.505845Z","shell.execute_reply.started":"2022-08-21T08:43:25.952198Z"},"trusted":true},"outputs":[{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"dbd72ecf0d36401ba26826f7d9a42540","version_major":2,"version_minor":0},"text/plain":["Writing Documents: 0%| | 0/50024 [00:00, ?it/s]"]},"metadata":{},"output_type":"display_data"}],"source":["# write documents\n","document_store.write_documents(preprocessed_docs)"]},{"cell_type":"markdown","metadata":{},"source":["## Initialize retriever (Embedding Retriever) and generate document embeddings\n","We choose a Sentence Tranformer model that is suitable for asymmetric semantic search (short query and longer passages), according to 
[documentation](https://www.sbert.net/examples/applications/semantic-search/README.html#symmetric-vs-asymmetric-semantic-search)."]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2022-08-21T08:56:25.360959Z","iopub.status.busy":"2022-08-21T08:56:25.360546Z","iopub.status.idle":"2022-08-21T08:58:07.214654Z","shell.execute_reply":"2022-08-21T08:58:07.213653Z","shell.execute_reply.started":"2022-08-21T08:56:25.360926Z"},"trusted":true},"outputs":[],"source":["from haystack.nodes import EmbeddingRetriever\n","\n","retriever = EmbeddingRetriever(\n"," document_store=document_store,\n"," embedding_model=\"sentence-transformers/msmarco-distilbert-base-tas-b\",\n"," model_format=\"sentence_transformers\",\n"," embed_meta_fields=['name']\n",")\n","\n","# generate embeddings\n","document_store.update_embeddings(retriever)"]},{"cell_type":"markdown","metadata":{},"source":["## Save and export index"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["import shutil\n","import glob"]},{"cell_type":"code","execution_count":73,"metadata":{"execution":{"iopub.execute_input":"2022-08-21T08:58:33.494417Z","iopub.status.busy":"2022-08-21T08:58:33.493822Z","iopub.status.idle":"2022-08-21T08:58:33.635915Z","shell.execute_reply":"2022-08-21T08:58:33.634599Z","shell.execute_reply.started":"2022-08-21T08:58:33.494382Z"},"trusted":true},"outputs":[],"source":["OUT_DIR = 'YOUR-OUT-DIR'\n","\n","document_store.save(\"my_faiss_index.faiss\")\n","for f in glob.glob('*faiss*.*')+glob.glob('faiss*.*'):\n"," shutil.copy(f, OUT_DIR)"]}],"metadata":{"kernelspec":{"display_name":"Python 3","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.7.12"}},"nbformat":4,"nbformat_minor":4}
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Indexing\n",
+ "Using [Haystack](https://github.com/deepset-ai/haystack), the following steps are performed:\n",
+ "- load and preprocess documents downloaded from Wikipedia\n",
+ "- create document store and write documents\n",
+ "- initialize retriever and generate document embeddings"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19",
+ "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5",
+ "trusted": true
+ },
+ "outputs": [],
+ "source": [
+ "! pip install farm-haystack[faiss-gpu]==1.7.0"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Load documents"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2022-08-21T08:23:23.692554Z",
+ "iopub.status.busy": "2022-08-21T08:23:23.692208Z",
+ "iopub.status.idle": "2022-08-21T08:23:23.700721Z",
+ "shell.execute_reply": "2022-08-21T08:23:23.698130Z",
+ "shell.execute_reply.started": "2022-08-21T08:23:23.692512Z"
+ },
+ "trusted": true
+ },
+ "outputs": [],
+ "source": [
+ "import glob, json"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2022-08-21T08:23:23.707774Z",
+ "iopub.status.busy": "2022-08-21T08:23:23.704107Z",
+ "iopub.status.idle": "2022-08-21T08:23:25.026910Z",
+ "shell.execute_reply": "2022-08-21T08:23:25.025990Z",
+ "shell.execute_reply.started": "2022-08-21T08:23:23.705010Z"
+ },
+ "trusted": true
+ },
+ "outputs": [],
+ "source": [
+ "docs = []\n",
+ "\n",
+ "for json_file in glob.glob(\"../input/crawl-rock/rock_wiki/*.json\"):\n",
+ " with open(json_file, \"r\") as fin:\n",
+ " doc = json.load(fin)\n",
+ "\n",
+ " docs.append(doc)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2022-08-21T08:23:25.030530Z",
+ "iopub.status.busy": "2022-08-21T08:23:25.029931Z",
+ "iopub.status.idle": "2022-08-21T08:23:25.039324Z",
+ "shell.execute_reply": "2022-08-21T08:23:25.037960Z",
+ "shell.execute_reply.started": "2022-08-21T08:23:25.030491Z"
+ },
+ "trusted": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "453"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "len(docs)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Preprocess documents"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2022-08-21T08:23:25.050479Z",
+ "iopub.status.busy": "2022-08-21T08:23:25.050099Z",
+ "iopub.status.idle": "2022-08-21T08:23:42.089083Z",
+ "shell.execute_reply": "2022-08-21T08:23:42.087929Z",
+ "shell.execute_reply.started": "2022-08-21T08:23:25.050446Z"
+ },
+ "trusted": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "108e8c46426f44e7be98a8ae930d81ce",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Preprocessing: 0%| | 0/453 [00:00, ?docs/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# preprocess documents, splitting by chunks of 2 sentences\n",
+ "\n",
+ "from haystack.nodes import PreProcessor\n",
+ "\n",
+ "processor = PreProcessor(\n",
+ " clean_empty_lines=True,\n",
+ " clean_whitespace=True,\n",
+ " clean_header_footer=True,\n",
+ " split_by=\"sentence\",\n",
+ " split_length=2,\n",
+ " split_respect_sentence_boundary=False,\n",
+ " split_overlap=0,\n",
+ " language=\"en\",\n",
+ ")\n",
+ "preprocessed_docs = processor.process(docs)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2022-08-21T08:23:42.092031Z",
+ "iopub.status.busy": "2022-08-21T08:23:42.090654Z",
+ "iopub.status.idle": "2022-08-21T08:23:42.105757Z",
+ "shell.execute_reply": "2022-08-21T08:23:42.104500Z",
+ "shell.execute_reply.started": "2022-08-21T08:23:42.091989Z"
+ },
+ "trusted": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "50024"
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "len(preprocessed_docs)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2022-08-21T08:23:42.108367Z",
+ "iopub.status.busy": "2022-08-21T08:23:42.107604Z",
+ "iopub.status.idle": "2022-08-21T08:23:42.117080Z",
+ "shell.execute_reply": "2022-08-21T08:23:42.115996Z",
+ "shell.execute_reply.started": "2022-08-21T08:23:42.108271Z"
+ },
+ "trusted": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[,\n",
+ " ,\n",
+ " ,\n",
+ " ,\n",
+ " ,\n",
+ " ,\n",
+ " ,\n",
+ " ,\n",
+ " ,\n",
+ " ]"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "preprocessed_docs[:10]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# select only documents with at least 10 words. Otherwise, the documents are not very informative\n",
+ "preprocessed_docs = [doc for doc in preprocessed_docs if len(doc.content.split()) >= 10]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Create document store ([FAISS](https://github.com/facebookresearch/faiss)) and write documents"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2022-08-21T08:23:42.119585Z",
+ "iopub.status.busy": "2022-08-21T08:23:42.118544Z",
+ "iopub.status.idle": "2022-08-21T08:23:42.124669Z",
+ "shell.execute_reply": "2022-08-21T08:23:42.123597Z",
+ "shell.execute_reply.started": "2022-08-21T08:23:42.119551Z"
+ },
+ "trusted": true
+ },
+ "outputs": [],
+ "source": [
+ "from haystack.document_stores import FAISSDocumentStore\n",
+ "from haystack.nodes import EmbeddingRetriever"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2022-08-21T08:23:42.129562Z",
+ "iopub.status.busy": "2022-08-21T08:23:42.128772Z",
+ "iopub.status.idle": "2022-08-21T08:23:42.259879Z",
+ "shell.execute_reply": "2022-08-21T08:23:42.258950Z",
+ "shell.execute_reply.started": "2022-08-21T08:23:42.129518Z"
+ },
+ "trusted": true
+ },
+ "outputs": [],
+ "source": [
+ "# the document store settings are those compatible with Embedding Retriever\n",
+ "document_store = FAISSDocumentStore(similarity=\"dot_product\", embedding_dim=768)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 46,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2022-08-21T08:43:25.952230Z",
+ "iopub.status.busy": "2022-08-21T08:43:25.951856Z",
+ "iopub.status.idle": "2022-08-21T08:46:12.506842Z",
+ "shell.execute_reply": "2022-08-21T08:46:12.505845Z",
+ "shell.execute_reply.started": "2022-08-21T08:43:25.952198Z"
+ },
+ "trusted": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "dbd72ecf0d36401ba26826f7d9a42540",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Writing Documents: 0%| | 0/50024 [00:00, ?it/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# write documents\n",
+ "document_store.write_documents(preprocessed_docs)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Initialize retriever (Embedding Retriever) and generate document embeddings\n",
+ "We choose a Sentence Transformers model that is suitable for asymmetric semantic search (short query and longer passages), according to the [documentation](https://www.sbert.net/examples/applications/semantic-search/README.html#symmetric-vs-asymmetric-semantic-search)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2022-08-21T08:56:25.360959Z",
+ "iopub.status.busy": "2022-08-21T08:56:25.360546Z",
+ "iopub.status.idle": "2022-08-21T08:58:07.214654Z",
+ "shell.execute_reply": "2022-08-21T08:58:07.213653Z",
+ "shell.execute_reply.started": "2022-08-21T08:56:25.360926Z"
+ },
+ "trusted": true
+ },
+ "outputs": [],
+ "source": [
+ "from haystack.nodes import EmbeddingRetriever\n",
+ "\n",
+ "retriever = EmbeddingRetriever(\n",
+ " document_store=document_store,\n",
+ " embedding_model=\"sentence-transformers/msmarco-distilbert-base-tas-b\",\n",
+ " model_format=\"sentence_transformers\",\n",
+ " embed_meta_fields=[\"name\"],\n",
+ ")\n",
+ "\n",
+ "# generate embeddings\n",
+ "document_store.update_embeddings(retriever)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Save and export index"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import shutil\n",
+ "import glob"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 73,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2022-08-21T08:58:33.494417Z",
+ "iopub.status.busy": "2022-08-21T08:58:33.493822Z",
+ "iopub.status.idle": "2022-08-21T08:58:33.635915Z",
+ "shell.execute_reply": "2022-08-21T08:58:33.634599Z",
+ "shell.execute_reply.started": "2022-08-21T08:58:33.494382Z"
+ },
+ "trusted": true
+ },
+ "outputs": [],
+ "source": [
+ "OUT_DIR = \"YOUR-OUT-DIR\"\n",
+ "\n",
+ "document_store.save(\"my_faiss_index.faiss\")\n",
+ "for f in glob.glob(\"*faiss*.*\") + glob.glob(\"faiss*.*\"):\n",
+ " shutil.copy(f, OUT_DIR)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3.7.13 ('venv': venv)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.13"
+ },
+ "vscode": {
+ "interpreter": {
+ "hash": "c114177cb475e38b99e396ae1ef7cfcaaa7967120589f47745b82f90d7e35d1b"
+ }
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/pages/Info.py b/pages/Info.py
index 3db2840..e5aeddb 100644
--- a/pages/Info.py
+++ b/pages/Info.py
@@ -1,3 +1 @@
import streamlit as st
-
-