diff --git a/Rock_fact_checker.py b/Rock_fact_checker.py index f50db26..2de5220 100644 --- a/Rock_fact_checker.py +++ b/Rock_fact_checker.py @@ -1,46 +1,50 @@ -import streamlit as st - +import random import time import logging from json import JSONDecodeError -# from markdown import markdown -# from annotated_text import annotation -# from urllib.parse import unquote -import random + +import streamlit as st import pandas as pd +import plotly.express as px from app_utils.backend_utils import load_statements, query -from app_utils.frontend_utils import set_state_if_absent, reset_results, entailment_html_messages +from app_utils.frontend_utils import ( + set_state_if_absent, + reset_results, + entailment_html_messages, + create_df_for_relevant_snippets, +) from app_utils.config import RETRIEVER_TOP_K def main(): - - statements = load_statements() # Persistent state - set_state_if_absent('statement', "Elvis Presley is alive") - set_state_if_absent('answer', '') - set_state_if_absent('results', None) - set_state_if_absent('raw_json', None) - set_state_if_absent('random_statement_requested', False) + set_state_if_absent("statement", "Elvis Presley is alive") + set_state_if_absent("answer", "") + set_state_if_absent("results", None) + set_state_if_absent("raw_json", None) + set_state_if_absent("random_statement_requested", False) - - ## MAIN CONTAINER st.write("# Fact checking 🎸 Rocks!") st.write() - st.markdown(""" + st.markdown( + """ ##### Enter a factual statement about [Rock music](https://en.wikipedia.org/wiki/List_of_mainstream_rock_performers) and let the AI check it out for you... - """) + """ + ) # Search bar - statement = st.text_input("", value=st.session_state.statement, - max_chars=100, on_change=reset_results) + statement = st.text_input( + "", value=st.session_state.statement, max_chars=100, on_change=reset_results + ) col1, col2 = st.columns(2) col1.markdown( - "", unsafe_allow_html=True) + "", unsafe_allow_html=True + ) col2.markdown( - "", unsafe_allow_html=True) + "", unsafe_allow_html=True + ) # Run button run_pressed = col1.button("Run") # Random statement button @@ -54,12 +58,15 @@ def main(): st.session_state.random_statement_requested = True # Re-runs the script setting the random statement as the textbox value # Unfortunately necessary as the Random statement button is _below_ the textbox - # raise st.script_runner.RerunException( - # st.script_request_queue.RerunData(None)) + # Adapted for Streamlit>=1.12 + raise st.runtime.scriptrunner.script_runner.RerunException( + st.runtime.scriptrunner.script_requests.RerunData("") + ) else: st.session_state.random_statement_requested = False - run_query = (run_pressed or statement != st.session_state.statement) \ - and not st.session_state.random_statement_requested + run_query = ( + run_pressed or statement != st.session_state.statement + ) and not st.session_state.random_statement_requested # Get results for query if run_query and statement: @@ -68,14 +75,14 @@ def main(): st.session_state.statement = statement with st.spinner("🧠    Performing neural search on documents..."): try: - st.session_state.results = query( - statement, RETRIEVER_TOP_K) + st.session_state.results = query(statement, RETRIEVER_TOP_K) time_end = time.time() print(time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime())) - print(f'elapsed time: {time_end - time_start}') + print(f"elapsed time: {time_end - time_start}") except JSONDecodeError as je: st.error( - "đź‘“    An error occurred reading the results. Is the document store working?") + "đź‘“    An error occurred reading the results. Is the document store working?" + ) return except Exception as e: logging.exception(e) @@ -85,85 +92,36 @@ def main(): # Display results if st.session_state.results: results = st.session_state.results - docs, agg_entailment_info = results['documents'], results['agg_entailment_info'] - print(results) - + docs, agg_entailment_info = results["documents"], results["agg_entailment_info"] + + # show different messages depending on entailment results max_key = max(agg_entailment_info, key=agg_entailment_info.get) message = entailment_html_messages[max_key] - st.markdown(f'

{message}

', unsafe_allow_html=True) - st.markdown(f'###### Aggregate entailment information:') - st.write(results['agg_entailment_info']) - st.markdown(f'###### Relevant snippets:') - - # colms = st.columns((2, 5, 1, 1, 1, 1)) - # fields = ["Page title",'Content', 'Relevance', 'contradiction', 'neutral', 'entailment'] - # for col, field_name in zip(colms, fields): - # # header - # col.write(field_name) - df = [] - for doc in docs: - # col1, col2, col3, col4, col5, col6 = st.columns((2, 5, 1, 1, 1, 1)) - # col1.write(f"[{doc.meta['name']}]({doc.meta['url']})") - # col2.write(f"{doc.content}") - # col3.write(f"{doc.score:.3f}") - # col4.write(f"{doc.meta['entailment_info']['contradiction']:.2f}") - # col5.write(f"{doc.meta['entailment_info']['neutral']:.2f}") - # col6.write(f"{doc.meta['entailment_info']['entailment']:.2f}") - - # 'con': f"{doc.meta['entailment_info']['contradiction']:.2f}", - # 'neu': f"{doc.meta['entailment_info']['neutral']:.2f}", - # 'ent': f"{doc.meta['entailment_info']['entailment']:.2f}", - # # 'url': doc.meta['url'], - # 'Content': doc.content} - # - # - # - row = {'Title': doc.meta['name'], - 'Relevance': f"{doc.score:.3f}", - 'con': f"{doc.meta['entailment_info']['contradiction']:.2f}", - 'neu': f"{doc.meta['entailment_info']['neutral']:.2f}", - 'ent': f"{doc.meta['entailment_info']['entailment']:.2f}", - # 'url': doc.meta['url'], - 'Content': doc.content} - df.append(row) - st.dataframe(pd.DataFrame(df))#.style.apply(highlight)) - - - # if len(st.session_state.results['answers']) == 0: - # st.info("""🤔    Haystack is unsure whether any of - # the documents contain an answer to your question. Try to reformulate it!""") - - # for result in st.session_state.results['answers']: - # result = result.to_dict() - # if result["answer"]: - # if alert_irrelevance and result['score'] < LOW_RELEVANCE_THRESHOLD: - # alert_irrelevance = False - # st.write(""" - #

Attention, the - # following answers have low relevance:

""", - # unsafe_allow_html=True) - - # answer, context = result["answer"], result["context"] - # start_idx = context.find(answer) - # end_idx = start_idx + len(answer) - # # Hack due to this bug: https://github.com/streamlit/streamlit/issues/3190 - # st.write(markdown("- ..."+context[:start_idx] + - # str(annotation(answer, "ANSWER", "#3e1c21", "white")) + - # context[end_idx:]+"..."), unsafe_allow_html=True) - # source = "" - # name = unquote(result['meta']['name']).replace('_', ' ') - # url = result['meta']['url'] - # source = f"[{name}]({url})" - # st.markdown( - # f"**Score:** {result['score']:.2f} - **Source:** {source}") - -# def make_pretty(styler): -# styler.set_caption("Weather Conditions") -# # styler.format(rain_condition) -# styler.format_con(lambda v: v.float(v)) -# styler.background_gradient(axis=None, vmin=0, vmax=1, cmap="YlGnBu") -# return styler - -def highlight(s): - return ['background-color: red']*5 -main() \ No newline at end of file + st.markdown(f"

{message}

", unsafe_allow_html=True) + + st.markdown(f"###### Aggregate entailment information:") + col1, col2 = st.columns([2, 1]) + df_agg_entailment_info = pd.DataFrame([results["agg_entailment_info"]]) + fig = px.scatter_ternary( + df_agg_entailment_info, + a="contradiction", + b="neutral", + c="entailment", + size="contradiction", + ) + with col1: + st.plotly_chart(fig, use_container_width=True) + with col2: + st.write(results["agg_entailment_info"]) + + st.markdown(f"###### Relevant snippets:") + df, urls = create_df_for_relevant_snippets(docs) + st.dataframe(df) + + str_wiki_pages = "Wikipedia source pages: " + for doc, url in urls.items(): + str_wiki_pages += f"[{doc}]({url}) " + st.markdown(str_wiki_pages) + + +main() diff --git a/app_utils/backend_utils.py b/app_utils/backend_utils.py index 17bebb1..7c6036f 100644 --- a/app_utils/backend_utils.py +++ b/app_utils/backend_utils.py @@ -1,42 +1,61 @@ import shutil + from haystack.document_stores import FAISSDocumentStore from haystack.nodes import EmbeddingRetriever from haystack.pipelines import Pipeline - import streamlit as st from app_utils.entailment_checker import EntailmentChecker +from app_utils.config import ( + STATEMENTS_PATH, + INDEX_DIR, + RETRIEVER_MODEL, + RETRIEVER_MODEL_FORMAT, + NLI_MODEL, +) + + +@st.cache() +def load_statements(): + """Load statements from file""" + with open(STATEMENTS_PATH) as fin: + statements = [ + line.strip() for line in fin.readlines() if not line.startswith("#") + ] + return statements -from app_utils.config import STATEMENTS_PATH, INDEX_DIR, RETRIEVER_MODEL, RETRIEVER_MODEL_FORMAT, NLI_MODEL # cached to make index and models load only at start -@st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None}, allow_output_mutation=True) +@st.cache( + hash_funcs={"builtins.SwigPyObject": lambda _: None}, allow_output_mutation=True +) def start_haystack(): """ load document store, retriever, reader and create pipeline """ - shutil.copy(f'{INDEX_DIR}/faiss_document_store.db', '.') + shutil.copy(f"{INDEX_DIR}/faiss_document_store.db", ".") document_store = FAISSDocumentStore( - faiss_index_path=f'{INDEX_DIR}/my_faiss_index.faiss', - faiss_config_path=f'{INDEX_DIR}/my_faiss_index.json') - print(f'Index size: {document_store.get_document_count()}') - + faiss_index_path=f"{INDEX_DIR}/my_faiss_index.faiss", + faiss_config_path=f"{INDEX_DIR}/my_faiss_index.json", + ) + print(f"Index size: {document_store.get_document_count()}") + retriever = EmbeddingRetriever( document_store=document_store, embedding_model=RETRIEVER_MODEL, - model_format=RETRIEVER_MODEL_FORMAT + model_format=RETRIEVER_MODEL_FORMAT, ) - - entailment_checker = EntailmentChecker(model_name_or_path=NLI_MODEL, - use_gpu=False) - + + entailment_checker = EntailmentChecker(model_name_or_path=NLI_MODEL, use_gpu=False) pipe = Pipeline() pipe.add_node(component=retriever, name="retriever", inputs=["Query"]) pipe.add_node(component=entailment_checker, name="ec", inputs=["retriever"]) return pipe + pipe = start_haystack() + # the pipeline is not included as parameter of the following function, # because it is difficult to cache @st.cache(persist=True, allow_output_mutation=True) @@ -45,28 +64,28 @@ def query(statement: str, retriever_top_k: int = 5): params = {"retriever": {"top_k": retriever_top_k}} results = pipe.run(statement, params=params) - scores, agg_con, agg_neu, agg_ent = 0,0,0,0 - for doc in results['documents']: - scores+=doc.score - ent_info=doc.meta['entailment_info'] - con,neu,ent = ent_info['contradiction'], ent_info['neutral'], ent_info['entailment'] - agg_con+=con*doc.score - agg_neu+=neu*doc.score - agg_ent+=ent*doc.score - - results['agg_entailment_info'] = { - 'contradiction': round(agg_con/scores, 2), - 'neutral': round(agg_neu/scores, 2), - 'entailment': round(agg_ent/scores, 2)} - - return results + scores, agg_con, agg_neu, agg_ent = 0, 0, 0, 0 + for i, doc in enumerate(results["documents"]): + scores += doc.score + ent_info = doc.meta["entailment_info"] + con, neu, ent = ( + ent_info["contradiction"], + ent_info["neutral"], + ent_info["entailment"], + ) + agg_con += con * doc.score + agg_neu += neu * doc.score + agg_ent += ent * doc.score -@st.cache() -def load_statements(): - """Load statements from file""" - with open(STATEMENTS_PATH) as fin: - statements = [line.strip() for line in fin.readlines() - if not line.startswith('#')] - return statements + # if in the first 3 documents there is a strong evidence of entailment/contradiction, + # there is non need to consider less relevant documents + if i == 2 and max(agg_con, agg_ent) / scores > 0.5: + results["documents"] = results["documents"][: i + 1] + break - \ No newline at end of file + results["agg_entailment_info"] = { + "contradiction": round(agg_con / scores, 2), + "neutral": round(agg_neu / scores, 2), + "entailment": round(agg_ent / scores, 2), + } + return results diff --git a/app_utils/config.py b/app_utils/config.py index 0dece9a..0f6259b 100644 --- a/app_utils/config.py +++ b/app_utils/config.py @@ -1,6 +1,5 @@ - -INDEX_DIR = 'data/index' -STATEMENTS_PATH = 'data/statements.txt' +INDEX_DIR = "data/index" +STATEMENTS_PATH = "data/statements.txt" RETRIEVER_MODEL = "sentence-transformers/msmarco-distilbert-base-tas-b" RETRIEVER_MODEL_FORMAT = "sentence_transformers" diff --git a/app_utils/entailment_checker.py b/app_utils/entailment_checker.py index fa39b3d..8868817 100644 --- a/app_utils/entailment_checker.py +++ b/app_utils/entailment_checker.py @@ -1,11 +1,12 @@ from typing import List, Optional -from transformers import AutoModelForSequenceClassification,AutoTokenizer,AutoConfig +from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig import torch from haystack.nodes.base import BaseComponent from haystack.modeling.utils import initialize_device_settings from haystack.schema import Document, Answer, Span + class EntailmentChecker(BaseComponent): """ This node checks the entailment between every document content and the query. @@ -38,29 +39,37 @@ def __init__( tokenizer = tokenizer or model_name_or_path self.tokenizer = AutoTokenizer.from_pretrained(tokenizer) - self.model = AutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path=model_name_or_path,revision=model_version) + self.model = AutoModelForSequenceClassification.from_pretrained( + pretrained_model_name_or_path=model_name_or_path, revision=model_version + ) self.batch_size = batch_size self.model.to(str(self.devices[0])) - + id2label = AutoConfig.from_pretrained(model_name_or_path).id2label - self.labels= [id2label[k].lower() for k in sorted(id2label)] - if 'entailment' not in self.labels: - raise ValueError("The model config must contain entailment value in the id2label dict.") - + self.labels = [id2label[k].lower() for k in sorted(id2label)] + if "entailment" not in self.labels: + raise ValueError( + "The model config must contain entailment value in the id2label dict." + ) + def run(self, query: str, documents: List[Document]): for doc in documents: - entailment_dict=self.get_entailment(premise=doc.content, hypotesis=query) - doc.meta['entailment_info']=entailment_dict - return {'documents':documents}, "output_1" - + entailment_dict = self.get_entailment(premise=doc.content, hypotesis=query) + doc.meta["entailment_info"] = entailment_dict + return {"documents": documents}, "output_1" + def run_batch(): pass - - def get_entailment(self, premise,hypotesis): + + def get_entailment(self, premise, hypotesis): with torch.no_grad(): - inputs = self.tokenizer(f'{premise}{self.tokenizer.sep_token}{hypotesis}', return_tensors="pt").to(self.devices[0]) + inputs = self.tokenizer( + f"{premise}{self.tokenizer.sep_token}{hypotesis}", return_tensors="pt" + ).to(self.devices[0]) out = self.model(**inputs) logits = out.logits - probs = torch.nn.functional.softmax(logits, dim=-1)[0,:].cpu().detach().numpy() - entailment_dict={k.lower():v for k,v in zip (self.labels, probs)} - return entailment_dict \ No newline at end of file + probs = ( + torch.nn.functional.softmax(logits, dim=-1)[0, :].cpu().detach().numpy() + ) + entailment_dict = {k.lower(): v for k, v in zip(self.labels, probs)} + return entailment_dict diff --git a/app_utils/frontend_utils.py b/app_utils/frontend_utils.py index 1061615..ffb0c9b 100644 --- a/app_utils/frontend_utils.py +++ b/app_utils/frontend_utils.py @@ -1,16 +1,45 @@ import streamlit as st +import pandas as pd + +entailment_html_messages = { + "entailment": 'The knowledge base seems to confirm your statement', + "contradiction": 'The knowledge base seems to contradict your statement', + "neutral": 'The knowledge base is neutral about your statement', +} def set_state_if_absent(key, value): if key not in st.session_state: st.session_state[key] = value + # Small callback to reset the interface in case the text of the question changes def reset_results(*args): st.session_state.answer = None st.session_state.results = None st.session_state.raw_json = None -entailment_html_messages = {'entailment': 'The knowledge base seems to confirm your statement', - 'contradiction': 'The knowledge base seems to contradict your statement', - 'neutral': 'The knowledge base is neutral about your statement'} + +def highlight_cols(s): + coldict = {"con": "#FFA07A", "neu": "#E5E4E2", "ent": "#a9d39e"} + if s.name in coldict.keys(): + return ["background-color: {}".format(coldict[s.name])] * len(s) + return [""] * len(s) + + +def create_df_for_relevant_snippets(docs): + rows = [] + urls = {} + for doc in docs: + row = { + "Title": doc.meta["name"], + "Relevance": f"{doc.score:.3f}", + "con": f"{doc.meta['entailment_info']['contradiction']:.2f}", + "neu": f"{doc.meta['entailment_info']['neutral']:.2f}", + "ent": f"{doc.meta['entailment_info']['entailment']:.2f}", + "Content": doc.content, + } + urls[doc.meta["name"]] = doc.meta["url"] + rows.append(row) + df = pd.DataFrame(rows).style.apply(highlight_cols) + return df, urls diff --git a/data/statements.txt b/data/statements.txt index ed7ae16..4a83fa6 100644 --- a/data/statements.txt +++ b/data/statements.txt @@ -17,4 +17,10 @@ Steve Vai collaborated with Frank Zappa The White Stripes were a trio The White Stripes were composed by Jack White and Meg White Scorpions is a German trap band -Sepultura is a heavy metal band \ No newline at end of file +Sepultura is a heavy metal band +Toxicity is a song by System of a down +System of a down is a Italian band +The Cure is a pop band +Mick Jagger loves pasta +Ozzy Osbourne was part of the Black Sabbath +Zucchero is an international artist \ No newline at end of file diff --git a/notebooks/get_wikipedia_data.ipynb b/notebooks/get_wikipedia_data.ipynb index bff4753..8241b26 100644 --- a/notebooks/get_wikipedia_data.ipynb +++ b/notebooks/get_wikipedia_data.ipynb @@ -1 +1,582 @@ -{"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"name":"python","version":"3.7.12","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"markdown","source":"# Download data from Wikipedia","metadata":{}},{"cell_type":"code","source":"# install wikipedia API python wrapper\n! pip install wikipedia","metadata":{"_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","execution":{"iopub.status.busy":"2022-08-20T21:43:59.293655Z","iopub.execute_input":"2022-08-20T21:43:59.294792Z","iopub.status.idle":"2022-08-20T21:44:15.263363Z","shell.execute_reply.started":"2022-08-20T21:43:59.294746Z","shell.execute_reply":"2022-08-20T21:44:15.262171Z"},"trusted":true},"execution_count":3,"outputs":[]},{"cell_type":"code","source":"import wikipedia\nimport json\nimport traceback","metadata":{"execution":{"iopub.status.busy":"2022-08-20T21:44:15.265341Z","iopub.execute_input":"2022-08-20T21:44:15.265753Z","iopub.status.idle":"2022-08-20T21:44:15.470330Z","shell.execute_reply.started":"2022-08-20T21:44:15.265709Z","shell.execute_reply":"2022-08-20T21:44:15.468665Z"},"trusted":true},"execution_count":4,"outputs":[]},{"cell_type":"code","source":"# titles to download, from https://en.wikipedia.org/wiki/List_of_mainstream_rock_performers\n\npages_titles=\"\"\"10cc\n10_Years_(band)\n3_Doors_Down\n311_(band)\n38_Special_(band)\nAccept_(band)\nAC/DC\nBryan_Adams\nAerosmith\nAFI_(band)\nAir_Supply\nThe_Alan_Parsons_Project\nAlice_in_Chains\nThe_All-American_Rejects\nThe_Allman_Brothers_Band\nAlter_Bridge\nAmbrosia_(band)\nAmerica_(band)\nThe_Animals\nAdam_Ant\nAnthrax_(American_band)\nApril_Wine\nArcade_Fire\nArctic_Monkeys\nAsia_(band)\nAudioslave\nAvenged_Sevenfold\nAwolnation\nThe_B-52's\nBachman–Turner_Overdrive\nBad_Company\nBadfinger\nThe_Band\nThe_Bangles\nBarenaked_Ladies\nBay_City_Rollers\nThe_Beach_Boys\nThe_Beatles\nBeck\nBen_Folds_Five\nPat_Benatar\nChuck_Berry\nThe_Big_Bopper\nBilly_Talent\nThe_Black_Crowes\nThe_Black_Keys\nBlack_Sabbath\nBlack_Stone_Cherry\nBlack_Veil_Brides\nBlink-182\nBloodhound_Gang\nBlue_October\nBlue_Öyster_Cult\nBlues_Traveler\nJames_Blunt\nBlur_(band)\nBon_Jovi\nBoston_(band)\nDavid_Bowie\nBowling_for_Soup\nBoys_Like_Girls\nBread_(band)\nBreaking_Benjamin\nBring_Me_the_Horizon\nJackson_Browne\nBuckcherry\nJeff_Buckley\nBullet_for_My_Valentine\nBush_(British_band)\nThe_Byrds\nCage_the_Elephant\nCake_(band)\nCanned_Heat\nThe_Cab\nThe_Cardigans\nThe_Cars\nCatfish_and_the_Bottlemen\nHarry_Chapin\nTracy_Chapman\nCheap_Trick\nChevelle_(band)\nChicago_(band)\nChubby_Checker\nCinderella_(band)\nDallas_Green_(musician)\nEric_Clapton\nThe_Clash\nEddie_Cochran\nJoe_Cocker\nCoheed_and_Cambria\nCold_Chisel\nColdplay\nCollective_Soul\nPhil_Collins\nAlice_Cooper\nChris_Cornell\nElvis_Costello\nCounting_Crows\nThe_Cranberries\nCrash_Test_Dummies\nCream_(band)\nCreed_(band)\nCreedence_Clearwater_Revival\nJim_Croce\nCrosby,_Stills,_Nash_&_Young\nChristopher_Cross\nSheryl_Crow\nCrowded_House\nThe_Cult\nThe_Cure\nDamn_Yankees_(band)\nDashboard_Confessional\nDaughtry_(band)\nThe_Dave_Clark_Five\nDave_Matthews_Band\nDays_of_the_New\nDeath_Cab_for_Cutie\nDeep_Purple\nDef_Leppard\nDeftones\nDepeche_Mode\nBo_Diddley\nDio_(band)\nDire_Straits\nDisturbed_(band)\nFats_Domino\nDonovan\nThe_Doobie_Brothers\nThe_Doors\nDr._Hook_&_the_Medicine_Show\nDropkick_Murphys\nDrowning_Pool\nDuran_Duran\nIan_Dury\nBob_Dylan\nEagles_(band)\nEcho_&_the_Bunnymen\nDuane_Eddy\nEdgar_Winter\nElectric_Light_Orchestra\nEmerson,_Lake_&_Palmer\nEngland_Dan_&_John_Ford_Coley\nMelissa_Etheridge\nEurope_(band)\nEvanescence\nEverclear_(band)\nEverlast\nThe_Everly_Brothers\nExtreme_(band)\nFaces_(band)\nFaith_No_More\nFall_Out_Boy\nBryan_Ferry\nFilter_(band)\nFinger_Eleven\nFireHouse\nFive_Finger_Death_Punch\nFive_for_Fighting\nThe_Fixx\nThe_Flaming_Lips\nFleetwood_Mac\nFlogging_Molly\nFlorence_and_the_Machine\nFlyleaf_(band)\nFoals_(band)\nDan_Fogelberg\nJohn_Fogerty\nFoo_Fighters\nForeigner_(band)\nFoster_the_People\nThe_Four_Seasons_(band)\nPeter_Frampton\nFranz_Ferdinand_(band)\nThe_Fray\nGlenn_Frey\nFuel_(band)\nFun_(band)\nPeter_Gabriel\nGarbage_(band)\nGenesis_(band)\nGhost_(Swedish_band)\nGin_Blossoms\nGary_Glitter\nThe_Go-Go's\nGodsmack\nGolden_Earring\nGoo_Goo_Dolls\nGood_Charlotte\nGrand_Funk_Railroad\nGrateful_Dead\nGreat_White\nGreen_Day\nGreta_Van_Fleet\nThe_Guess_Who\nGuns_N'_Roses\nHalestorm\nBill_Haley_&_His_Comets\nHall_&_Oates\nGeorge_Harrison\nHeart_(band)\nJimi_Hendrix\nDon_Henley\nHerman's_Hermits\nHighly_Suspect\nHinder\nThe_Hives\nHole_(band)\nThe_Hollies\nBuddy_Holly\nHoobastank\nHootie_&_the_Blowfish\nIcehouse_(band)\nBilly_Idol\nImagine_Dragons\nIncubus_(band)\nInterpol_(band)\nINXS\nIron_Maiden\nThe_J._Geils_Band\nThe_Jam\nTommy_James_and_the_Shondells\nJane's_Addiction\nJefferson_Airplane\nJefferson_Starship\nThe_Jesus_and_Mary_Chain\nJet_(Australian_band)\nJethro_Tull_(band)\nJoan_Jett\nJimmy_Eat_World\nBilly_Joel\nElton_John\nJanis_Joplin\nJourney_(band)\nJoy_Division\nJudas_Priest\nKaiser_Chiefs\nKaleo_(band)\nKansas_(band)\nKeane_(band)\nKid_Rock\nThe_Killers\nKillswitch_Engage\nKings_of_Leon\nThe_Kinks\nKiss_(band)\nKorn\nLenny_Kravitz\nLacuna_Coil\nLamb_of_God_(band)\nAvril_Lavigne\nLed_Zeppelin\nJohn_Lennon\nHuey_Lewis_and_the_News\nJerry_Lee_Lewis\nLifehouse_(band)\nLimp_Bizkit\nLinkin_Park\nLittle_Richard\nLittle_River_Band\nLive_(band)\nLiving_Colour\nKenny_Loggins\nLoverboy\nThe_Lovin'_Spoonful\nThe_Lumineers\nLynyrd_Skynyrd\nThe_Mamas_&_the_Papas\nMarilyn_Manson\nThe_Marshall_Tucker_Band\nMatchbox_Twenty\nJohn_Mayer\nPaul_McCartney\nMeat_Loaf\nMegadeth\nJohn_Mellencamp\nMen_at_Work\nMetallica\nMidnight_Oil\nMike_and_the_Mechanics\nModest_Mouse\nEddie_Money\nThe_Monkees\nThe_Moody_Blues\nAlanis_Morissette\nVan_Morrison\nMorrissey\nMötley_Crüe\nMotörhead\nMudvayne\nMumford_&_Sons\nMuse_(band)\nMy_Chemical_Romance\nNickelback\nStevie_Nicks\nHarry_Nilsson\nNine_Inch_Nails\nNirvana_(band)\nNo_Doubt\nTed_Nugent\nOasis_(band)\nThe_Offspring\nRoy_Orbison\nOzzy_Osbourne\nOur_Lady_Peace\nThe_Outfield\nP.O.D.\nPanic!_at_the_Disco\nPantera\nPapa_Roach\nParamore\nPearl_Jam\nA_Perfect_Circle\nTom_Petty_and_the_Heartbreakers\nPink_Floyd\nPixies_(band)\nRobert_Plant\nPoison_(American_band)\nThe_Police\nIggy_Pop\nPop_Evil\nThe_Presidents_of_the_United_States_of_America_(band)\nThe_Pretenders\nElvis_Presley\nThe_Pretty_Reckless\nPrimus_(band)\nPuddle_of_Mudd\nQueen_(band)\nQueens_of_the_Stone_Age\nQueensrÿche\nQuiet_Riot\nR.E.M.\nRadiohead\nRage_Against_the_Machine\nRainbow_(rock_band)\nRammstein\nRamones\nRed_Hot_Chili_Peppers\nLou_Reed\nREO_Speedwagon\nRise_Against\nThe_Rolling_Stones\nLinda_Ronstadt\nRoxy_Music\nRoyal_Blood_(band)\nRush_(band)\nSaliva_(band)\nSam_Fender\nSantana_(band)\nJoe_Satriani\nSaving_Abel\nScorpions_(band)\nThe_Script\nSeether\nBob_Seger\nSepultura\nSex_Pistols\nShakin'_Stevens\nShinedown\nSilverchair\nSimon_&_Garfunkel\nSimple_Minds\nSimple_Plan\nSkid_Row_(American_band)\nSkillet_(band)\nSlade\nSlayer\nSlipknot_(band)\nSmall_Faces\nSmash_Mouth\nThe_Smashing_Pumpkins\nThe_Smiths\nSmokie_(band)\nSnow_Patrol\nSocial_Distortion\nSoundgarden\nBruce_Springsteen\nBilly_Squier\nStaind\nRingo_Starr\nStarset\nStarship_(band)\nStatus_Quo_(band)\nSteely_Dan\nSteppenwolf_(band)\nSteve_Miller_Band\nRod_Stewart\nSting_(musician)\nThe_Stone_Roses\nStone_Sour\nStone_Temple_Pilots\nThe_Strokes\nStyx_(band)\nSublime_(band)\nSum_41\nSupertramp\nSurvivor_(band)\nThe_Sweet\nSystem_of_a_Down\nT._Rex_(band)\nTalking_Heads\nJames_Taylor\nTenacious_D\nTesla_(band)\nTheory_of_a_Deadman\nThin_Lizzy\nThird_Eye_Blind\nThirty_Seconds_to_Mars\nGeorge_Thorogood\nThousand_Foot_Krutch\nThree_Days_Grace\nThree_Dog_Night\nTool_(band)\nToto_(band)\nTraffic_(band)\nThe_Tragically_Hip\nTrain_(band)\nTraveling_Wilburys\nTravis_(band)\nTrivium_(band)\nTwenty_One_Pilots\nTwisted_Sister\nU2\nUriah_Heep_(band)\nThe_Used\nSteve_Vai\nRitchie_Valens\nVampire_Weekend\nVan_Halen\nStevie_Ray_Vaughan\nVelvet_Revolver\nThe_Velvet_Underground\nThe_Verve\nVolbeat\nJoe_Walsh\nWarrant_(American_band)\nWeezer\nJack_White\nThe_White_Stripes\nWhite_Zombie_(band)\nWhitesnake\nThe_Who\nPaul_McCartney_and_Wings\nSteve_Winwood\nThe_Yardbirds\nYes_(band)\nNeil_Young\nFrank_Zappa\nRob_Zombie\nThe_Zombies\nZZ_Top\"\"\".split('\\n')","metadata":{"execution":{"iopub.status.busy":"2022-08-20T23:34:24.681697Z","iopub.execute_input":"2022-08-20T23:34:24.682223Z","iopub.status.idle":"2022-08-20T23:34:24.693942Z","shell.execute_reply.started":"2022-08-20T23:34:24.682178Z","shell.execute_reply":"2022-08-20T23:34:24.693004Z"},"trusted":true},"execution_count":54,"outputs":[]},{"cell_type":"code","source":"for i,raw_title in enumerate(pages_titles):\n if i%10==0:\n print(i/len(pages_titles)*100)\n try:\n page=wikipedia.page(title=raw_title.replace('_', ' '), auto_suggest=False)\n id_ = page.pageid\n url= page.url\n dic={'content': page.content,\n 'meta':{'name': page.title,\n 'url': url}}\n\n \n with open(f'/kaggle/working/rock_wiki/{id_}.json','w') as fo:\n json.dump(dic, fo)\n except Exception as e:\n traceback.print_exc()\n print(raw_title)\n ","metadata":{"execution":{"iopub.status.busy":"2022-08-20T23:34:49.157641Z","iopub.execute_input":"2022-08-20T23:34:49.158086Z","iopub.status.idle":"2022-08-20T23:44:29.346317Z","shell.execute_reply.started":"2022-08-20T23:34:49.158047Z","shell.execute_reply":"2022-08-20T23:44:29.345032Z"},"trusted":true},"execution_count":57,"outputs":[]},{"cell_type":"code","source":"! tar -czvf rock_wiki.tar.gz ./rock_wiki","metadata":{"execution":{"iopub.status.busy":"2022-08-20T23:50:44.643851Z","iopub.execute_input":"2022-08-20T23:50:44.644378Z","iopub.status.idle":"2022-08-20T23:50:44.650366Z","shell.execute_reply.started":"2022-08-20T23:50:44.644328Z","shell.execute_reply":"2022-08-20T23:50:44.649169Z"},"trusted":true},"execution_count":60,"outputs":[]}]} \ No newline at end of file +{ + "metadata": { + "kernelspec": { + "language": "python", + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.7.12", + "mimetype": "text/x-python", + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "pygments_lexer": "ipython3", + "nbconvert_exporter": "python", + "file_extension": ".py" + } + }, + "nbformat_minor": 4, + "nbformat": 4, + "cells": [ + { + "cell_type": "markdown", + "source": "# Download data from Wikipedia", + "metadata": {} + }, + { + "cell_type": "code", + "source": "# install wikipedia API python wrapper\n! pip install wikipedia", + "metadata": { + "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5", + "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19", + "execution": { + "iopub.status.busy": "2022-08-20T21:43:59.293655Z", + "iopub.execute_input": "2022-08-20T21:43:59.294792Z", + "iopub.status.idle": "2022-08-20T21:44:15.263363Z", + "shell.execute_reply.started": "2022-08-20T21:43:59.294746Z", + "shell.execute_reply": "2022-08-20T21:44:15.262171Z" + }, + "trusted": true + }, + "execution_count": 3, + "outputs": [] + }, + { + "cell_type": "code", + "source": "import wikipedia\nimport json\nimport traceback", + "metadata": { + "execution": { + "iopub.status.busy": "2022-08-20T21:44:15.265341Z", + "iopub.execute_input": "2022-08-20T21:44:15.265753Z", + "iopub.status.idle": "2022-08-20T21:44:15.470330Z", + "shell.execute_reply.started": "2022-08-20T21:44:15.265709Z", + "shell.execute_reply": "2022-08-20T21:44:15.468665Z" + }, + "trusted": true + }, + "execution_count": 4, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# titles to download, from https://en.wikipedia.org/wiki/List_of_mainstream_rock_performers\n", + "\n", + "pages_titles = \"\"\"10cc\n", + "10_Years_(band)\n", + "3_Doors_Down\n", + "311_(band)\n", + "38_Special_(band)\n", + "Accept_(band)\n", + "AC/DC\n", + "Bryan_Adams\n", + "Aerosmith\n", + "AFI_(band)\n", + "Air_Supply\n", + "The_Alan_Parsons_Project\n", + "Alice_in_Chains\n", + "The_All-American_Rejects\n", + "The_Allman_Brothers_Band\n", + "Alter_Bridge\n", + "Ambrosia_(band)\n", + "America_(band)\n", + "The_Animals\n", + "Adam_Ant\n", + "Anthrax_(American_band)\n", + "April_Wine\n", + "Arcade_Fire\n", + "Arctic_Monkeys\n", + "Asia_(band)\n", + "Audioslave\n", + "Avenged_Sevenfold\n", + "Awolnation\n", + "The_B-52's\n", + "Bachman–Turner_Overdrive\n", + "Bad_Company\n", + "Badfinger\n", + "The_Band\n", + "The_Bangles\n", + "Barenaked_Ladies\n", + "Bay_City_Rollers\n", + "The_Beach_Boys\n", + "The_Beatles\n", + "Beck\n", + "Ben_Folds_Five\n", + "Pat_Benatar\n", + "Chuck_Berry\n", + "The_Big_Bopper\n", + "Billy_Talent\n", + "The_Black_Crowes\n", + "The_Black_Keys\n", + "Black_Sabbath\n", + "Black_Stone_Cherry\n", + "Black_Veil_Brides\n", + "Blink-182\n", + "Bloodhound_Gang\n", + "Blue_October\n", + "Blue_Öyster_Cult\n", + "Blues_Traveler\n", + "James_Blunt\n", + "Blur_(band)\n", + "Bon_Jovi\n", + "Boston_(band)\n", + "David_Bowie\n", + "Bowling_for_Soup\n", + "Boys_Like_Girls\n", + "Bread_(band)\n", + "Breaking_Benjamin\n", + "Bring_Me_the_Horizon\n", + "Jackson_Browne\n", + "Buckcherry\n", + "Jeff_Buckley\n", + "Bullet_for_My_Valentine\n", + "Bush_(British_band)\n", + "The_Byrds\n", + "Cage_the_Elephant\n", + "Cake_(band)\n", + "Canned_Heat\n", + "The_Cab\n", + "The_Cardigans\n", + "The_Cars\n", + "Catfish_and_the_Bottlemen\n", + "Harry_Chapin\n", + "Tracy_Chapman\n", + "Cheap_Trick\n", + "Chevelle_(band)\n", + "Chicago_(band)\n", + "Chubby_Checker\n", + "Cinderella_(band)\n", + "Dallas_Green_(musician)\n", + "Eric_Clapton\n", + "The_Clash\n", + "Eddie_Cochran\n", + "Joe_Cocker\n", + "Coheed_and_Cambria\n", + "Cold_Chisel\n", + "Coldplay\n", + "Collective_Soul\n", + "Phil_Collins\n", + "Alice_Cooper\n", + "Chris_Cornell\n", + "Elvis_Costello\n", + "Counting_Crows\n", + "The_Cranberries\n", + "Crash_Test_Dummies\n", + "Cream_(band)\n", + "Creed_(band)\n", + "Creedence_Clearwater_Revival\n", + "Jim_Croce\n", + "Crosby,_Stills,_Nash_&_Young\n", + "Christopher_Cross\n", + "Sheryl_Crow\n", + "Crowded_House\n", + "The_Cult\n", + "The_Cure\n", + "Damn_Yankees_(band)\n", + "Dashboard_Confessional\n", + "Daughtry_(band)\n", + "The_Dave_Clark_Five\n", + "Dave_Matthews_Band\n", + "Days_of_the_New\n", + "Death_Cab_for_Cutie\n", + "Deep_Purple\n", + "Def_Leppard\n", + "Deftones\n", + "Depeche_Mode\n", + "Bo_Diddley\n", + "Dio_(band)\n", + "Dire_Straits\n", + "Disturbed_(band)\n", + "Fats_Domino\n", + "Donovan\n", + "The_Doobie_Brothers\n", + "The_Doors\n", + "Dr._Hook_&_the_Medicine_Show\n", + "Dropkick_Murphys\n", + "Drowning_Pool\n", + "Duran_Duran\n", + "Ian_Dury\n", + "Bob_Dylan\n", + "Eagles_(band)\n", + "Echo_&_the_Bunnymen\n", + "Duane_Eddy\n", + "Edgar_Winter\n", + "Electric_Light_Orchestra\n", + "Emerson,_Lake_&_Palmer\n", + "England_Dan_&_John_Ford_Coley\n", + "Melissa_Etheridge\n", + "Europe_(band)\n", + "Evanescence\n", + "Everclear_(band)\n", + "Everlast\n", + "The_Everly_Brothers\n", + "Extreme_(band)\n", + "Faces_(band)\n", + "Faith_No_More\n", + "Fall_Out_Boy\n", + "Bryan_Ferry\n", + "Filter_(band)\n", + "Finger_Eleven\n", + "FireHouse\n", + "Five_Finger_Death_Punch\n", + "Five_for_Fighting\n", + "The_Fixx\n", + "The_Flaming_Lips\n", + "Fleetwood_Mac\n", + "Flogging_Molly\n", + "Florence_and_the_Machine\n", + "Flyleaf_(band)\n", + "Foals_(band)\n", + "Dan_Fogelberg\n", + "John_Fogerty\n", + "Foo_Fighters\n", + "Foreigner_(band)\n", + "Foster_the_People\n", + "The_Four_Seasons_(band)\n", + "Peter_Frampton\n", + "Franz_Ferdinand_(band)\n", + "The_Fray\n", + "Glenn_Frey\n", + "Fuel_(band)\n", + "Fun_(band)\n", + "Peter_Gabriel\n", + "Garbage_(band)\n", + "Genesis_(band)\n", + "Ghost_(Swedish_band)\n", + "Gin_Blossoms\n", + "Gary_Glitter\n", + "The_Go-Go's\n", + "Godsmack\n", + "Golden_Earring\n", + "Goo_Goo_Dolls\n", + "Good_Charlotte\n", + "Grand_Funk_Railroad\n", + "Grateful_Dead\n", + "Great_White\n", + "Green_Day\n", + "Greta_Van_Fleet\n", + "The_Guess_Who\n", + "Guns_N'_Roses\n", + "Halestorm\n", + "Bill_Haley_&_His_Comets\n", + "Hall_&_Oates\n", + "George_Harrison\n", + "Heart_(band)\n", + "Jimi_Hendrix\n", + "Don_Henley\n", + "Herman's_Hermits\n", + "Highly_Suspect\n", + "Hinder\n", + "The_Hives\n", + "Hole_(band)\n", + "The_Hollies\n", + "Buddy_Holly\n", + "Hoobastank\n", + "Hootie_&_the_Blowfish\n", + "Icehouse_(band)\n", + "Billy_Idol\n", + "Imagine_Dragons\n", + "Incubus_(band)\n", + "Interpol_(band)\n", + "INXS\n", + "Iron_Maiden\n", + "The_J._Geils_Band\n", + "The_Jam\n", + "Tommy_James_and_the_Shondells\n", + "Jane's_Addiction\n", + "Jefferson_Airplane\n", + "Jefferson_Starship\n", + "The_Jesus_and_Mary_Chain\n", + "Jet_(Australian_band)\n", + "Jethro_Tull_(band)\n", + "Joan_Jett\n", + "Jimmy_Eat_World\n", + "Billy_Joel\n", + "Elton_John\n", + "Janis_Joplin\n", + "Journey_(band)\n", + "Joy_Division\n", + "Judas_Priest\n", + "Kaiser_Chiefs\n", + "Kaleo_(band)\n", + "Kansas_(band)\n", + "Keane_(band)\n", + "Kid_Rock\n", + "The_Killers\n", + "Killswitch_Engage\n", + "Kings_of_Leon\n", + "The_Kinks\n", + "Kiss_(band)\n", + "Korn\n", + "Lenny_Kravitz\n", + "Lacuna_Coil\n", + "Lamb_of_God_(band)\n", + "Avril_Lavigne\n", + "Led_Zeppelin\n", + "John_Lennon\n", + "Huey_Lewis_and_the_News\n", + "Jerry_Lee_Lewis\n", + "Lifehouse_(band)\n", + "Limp_Bizkit\n", + "Linkin_Park\n", + "Little_Richard\n", + "Little_River_Band\n", + "Live_(band)\n", + "Living_Colour\n", + "Kenny_Loggins\n", + "Loverboy\n", + "The_Lovin'_Spoonful\n", + "The_Lumineers\n", + "Lynyrd_Skynyrd\n", + "The_Mamas_&_the_Papas\n", + "Marilyn_Manson\n", + "The_Marshall_Tucker_Band\n", + "Matchbox_Twenty\n", + "John_Mayer\n", + "Paul_McCartney\n", + "Meat_Loaf\n", + "Megadeth\n", + "John_Mellencamp\n", + "Men_at_Work\n", + "Metallica\n", + "Midnight_Oil\n", + "Mike_and_the_Mechanics\n", + "Modest_Mouse\n", + "Eddie_Money\n", + "The_Monkees\n", + "The_Moody_Blues\n", + "Alanis_Morissette\n", + "Van_Morrison\n", + "Morrissey\n", + "Mötley_Crüe\n", + "Motörhead\n", + "Mudvayne\n", + "Mumford_&_Sons\n", + "Muse_(band)\n", + "My_Chemical_Romance\n", + "Nickelback\n", + "Stevie_Nicks\n", + "Harry_Nilsson\n", + "Nine_Inch_Nails\n", + "Nirvana_(band)\n", + "No_Doubt\n", + "Ted_Nugent\n", + "Oasis_(band)\n", + "The_Offspring\n", + "Roy_Orbison\n", + "Ozzy_Osbourne\n", + "Our_Lady_Peace\n", + "The_Outfield\n", + "P.O.D.\n", + "Panic!_at_the_Disco\n", + "Pantera\n", + "Papa_Roach\n", + "Paramore\n", + "Pearl_Jam\n", + "A_Perfect_Circle\n", + "Tom_Petty_and_the_Heartbreakers\n", + "Pink_Floyd\n", + "Pixies_(band)\n", + "Robert_Plant\n", + "Poison_(American_band)\n", + "The_Police\n", + "Iggy_Pop\n", + "Pop_Evil\n", + "The_Presidents_of_the_United_States_of_America_(band)\n", + "The_Pretenders\n", + "Elvis_Presley\n", + "The_Pretty_Reckless\n", + "Primus_(band)\n", + "Puddle_of_Mudd\n", + "Queen_(band)\n", + "Queens_of_the_Stone_Age\n", + "Queensrÿche\n", + "Quiet_Riot\n", + "R.E.M.\n", + "Radiohead\n", + "Rage_Against_the_Machine\n", + "Rainbow_(rock_band)\n", + "Rammstein\n", + "Ramones\n", + "Red_Hot_Chili_Peppers\n", + "Lou_Reed\n", + "REO_Speedwagon\n", + "Rise_Against\n", + "The_Rolling_Stones\n", + "Linda_Ronstadt\n", + "Roxy_Music\n", + "Royal_Blood_(band)\n", + "Rush_(band)\n", + "Saliva_(band)\n", + "Sam_Fender\n", + "Santana_(band)\n", + "Joe_Satriani\n", + "Saving_Abel\n", + "Scorpions_(band)\n", + "The_Script\n", + "Seether\n", + "Bob_Seger\n", + "Sepultura\n", + "Sex_Pistols\n", + "Shakin'_Stevens\n", + "Shinedown\n", + "Silverchair\n", + "Simon_&_Garfunkel\n", + "Simple_Minds\n", + "Simple_Plan\n", + "Skid_Row_(American_band)\n", + "Skillet_(band)\n", + "Slade\n", + "Slayer\n", + "Slipknot_(band)\n", + "Small_Faces\n", + "Smash_Mouth\n", + "The_Smashing_Pumpkins\n", + "The_Smiths\n", + "Smokie_(band)\n", + "Snow_Patrol\n", + "Social_Distortion\n", + "Soundgarden\n", + "Bruce_Springsteen\n", + "Billy_Squier\n", + "Staind\n", + "Ringo_Starr\n", + "Starset\n", + "Starship_(band)\n", + "Status_Quo_(band)\n", + "Steely_Dan\n", + "Steppenwolf_(band)\n", + "Steve_Miller_Band\n", + "Rod_Stewart\n", + "Sting_(musician)\n", + "The_Stone_Roses\n", + "Stone_Sour\n", + "Stone_Temple_Pilots\n", + "The_Strokes\n", + "Styx_(band)\n", + "Sublime_(band)\n", + "Sum_41\n", + "Supertramp\n", + "Survivor_(band)\n", + "The_Sweet\n", + "System_of_a_Down\n", + "T._Rex_(band)\n", + "Talking_Heads\n", + "James_Taylor\n", + "Tenacious_D\n", + "Tesla_(band)\n", + "Theory_of_a_Deadman\n", + "Thin_Lizzy\n", + "Third_Eye_Blind\n", + "Thirty_Seconds_to_Mars\n", + "George_Thorogood\n", + "Thousand_Foot_Krutch\n", + "Three_Days_Grace\n", + "Three_Dog_Night\n", + "Tool_(band)\n", + "Toto_(band)\n", + "Traffic_(band)\n", + "The_Tragically_Hip\n", + "Train_(band)\n", + "Traveling_Wilburys\n", + "Travis_(band)\n", + "Trivium_(band)\n", + "Twenty_One_Pilots\n", + "Twisted_Sister\n", + "U2\n", + "Uriah_Heep_(band)\n", + "The_Used\n", + "Steve_Vai\n", + "Ritchie_Valens\n", + "Vampire_Weekend\n", + "Van_Halen\n", + "Stevie_Ray_Vaughan\n", + "Velvet_Revolver\n", + "The_Velvet_Underground\n", + "The_Verve\n", + "Volbeat\n", + "Joe_Walsh\n", + "Warrant_(American_band)\n", + "Weezer\n", + "Jack_White\n", + "The_White_Stripes\n", + "White_Zombie_(band)\n", + "Whitesnake\n", + "The_Who\n", + "Paul_McCartney_and_Wings\n", + "Steve_Winwood\n", + "The_Yardbirds\n", + "Yes_(band)\n", + "Neil_Young\n", + "Frank_Zappa\n", + "Rob_Zombie\n", + "The_Zombies\n", + "ZZ_Top\"\"\".split(\n", + " \"\\n\"\n", + ")" + ], + "metadata": { + "execution": { + "iopub.status.busy": "2022-08-20T23:34:24.681697Z", + "iopub.execute_input": "2022-08-20T23:34:24.682223Z", + "iopub.status.idle": "2022-08-20T23:34:24.693942Z", + "shell.execute_reply.started": "2022-08-20T23:34:24.682178Z", + "shell.execute_reply": "2022-08-20T23:34:24.693004Z" + }, + "trusted": true + }, + "execution_count": 54, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "for i, raw_title in enumerate(pages_titles):\n", + " if i % 10 == 0:\n", + " print(i / len(pages_titles) * 100)\n", + " try:\n", + " page = wikipedia.page(title=raw_title.replace(\"_\", \" \"), auto_suggest=False)\n", + " id_ = page.pageid\n", + " url = page.url\n", + " dic = {\"content\": page.content, \"meta\": {\"name\": page.title, \"url\": url}}\n", + "\n", + " with open(f\"/kaggle/working/rock_wiki/{id_}.json\", \"w\") as fo:\n", + " json.dump(dic, fo)\n", + " except Exception as e:\n", + " traceback.print_exc()\n", + " print(raw_title)" + ], + "metadata": { + "execution": { + "iopub.status.busy": "2022-08-20T23:34:49.157641Z", + "iopub.execute_input": "2022-08-20T23:34:49.158086Z", + "iopub.status.idle": "2022-08-20T23:44:29.346317Z", + "shell.execute_reply.started": "2022-08-20T23:34:49.158047Z", + "shell.execute_reply": "2022-08-20T23:44:29.345032Z" + }, + "trusted": true + }, + "execution_count": 57, + "outputs": [] + }, + { + "cell_type": "code", + "source": "! tar -czvf rock_wiki.tar.gz ./rock_wiki", + "metadata": { + "execution": { + "iopub.status.busy": "2022-08-20T23:50:44.643851Z", + "iopub.execute_input": "2022-08-20T23:50:44.644378Z", + "iopub.status.idle": "2022-08-20T23:50:44.650366Z", + "shell.execute_reply.started": "2022-08-20T23:50:44.644328Z", + "shell.execute_reply": "2022-08-20T23:50:44.649169Z" + }, + "trusted": true + }, + "execution_count": 60, + "outputs": [] + } + ] +} \ No newline at end of file diff --git a/notebooks/indexing.ipynb b/notebooks/indexing.ipynb index b5918f5..24fa2d5 100644 --- a/notebooks/indexing.ipynb +++ b/notebooks/indexing.ipynb @@ -1 +1,417 @@ -{"cells":[{"cell_type":"markdown","metadata":{},"source":["# Indexing\n","Using [Haystack](https://github.com/deepset-ai/haystack), the following steps are performed:\n","- load and preprocess documents downloaded from Wikipedia\n","- create document store and write documents\n","- initialize retriever and generate document embeddings"]},{"cell_type":"code","execution_count":null,"metadata":{"_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","trusted":true},"outputs":[],"source":["! pip install farm-haystack[faiss-gpu]==1.7.0"]},{"cell_type":"markdown","metadata":{},"source":["## Load documents"]},{"cell_type":"code","execution_count":2,"metadata":{"execution":{"iopub.execute_input":"2022-08-21T08:23:23.692554Z","iopub.status.busy":"2022-08-21T08:23:23.692208Z","iopub.status.idle":"2022-08-21T08:23:23.700721Z","shell.execute_reply":"2022-08-21T08:23:23.698130Z","shell.execute_reply.started":"2022-08-21T08:23:23.692512Z"},"trusted":true},"outputs":[],"source":["import glob, json"]},{"cell_type":"code","execution_count":3,"metadata":{"execution":{"iopub.execute_input":"2022-08-21T08:23:23.707774Z","iopub.status.busy":"2022-08-21T08:23:23.704107Z","iopub.status.idle":"2022-08-21T08:23:25.026910Z","shell.execute_reply":"2022-08-21T08:23:25.025990Z","shell.execute_reply.started":"2022-08-21T08:23:23.705010Z"},"trusted":true},"outputs":[],"source":["docs=[]\n","\n","for json_file in glob.glob('../input/crawl-rock/rock_wiki/*.json'):\n"," with open(json_file, 'r') as fin:\n"," doc=json.load(fin)\n","\n"," docs.append(doc)\n"]},{"cell_type":"code","execution_count":4,"metadata":{"execution":{"iopub.execute_input":"2022-08-21T08:23:25.030530Z","iopub.status.busy":"2022-08-21T08:23:25.029931Z","iopub.status.idle":"2022-08-21T08:23:25.039324Z","shell.execute_reply":"2022-08-21T08:23:25.037960Z","shell.execute_reply.started":"2022-08-21T08:23:25.030491Z"},"trusted":true},"outputs":[{"data":{"text/plain":["453"]},"execution_count":4,"metadata":{},"output_type":"execute_result"}],"source":["len(docs)"]},{"cell_type":"markdown","metadata":{},"source":["## Preprocess documents"]},{"cell_type":"code","execution_count":6,"metadata":{"execution":{"iopub.execute_input":"2022-08-21T08:23:25.050479Z","iopub.status.busy":"2022-08-21T08:23:25.050099Z","iopub.status.idle":"2022-08-21T08:23:42.089083Z","shell.execute_reply":"2022-08-21T08:23:42.087929Z","shell.execute_reply.started":"2022-08-21T08:23:25.050446Z"},"trusted":true},"outputs":[{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"108e8c46426f44e7be98a8ae930d81ce","version_major":2,"version_minor":0},"text/plain":["Preprocessing: 0%| | 0/453 [00:00,\n"," ,\n"," ,\n"," ,\n"," ,\n"," ,\n"," ,\n"," ,\n"," ,\n"," ]"]},"execution_count":8,"metadata":{},"output_type":"execute_result"}],"source":["preprocessed_docs[:10]"]},{"cell_type":"markdown","metadata":{},"source":["## Create document store ([FAISS](https://github.com/facebookresearch/faiss)) and write documents"]},{"cell_type":"code","execution_count":9,"metadata":{"execution":{"iopub.execute_input":"2022-08-21T08:23:42.119585Z","iopub.status.busy":"2022-08-21T08:23:42.118544Z","iopub.status.idle":"2022-08-21T08:23:42.124669Z","shell.execute_reply":"2022-08-21T08:23:42.123597Z","shell.execute_reply.started":"2022-08-21T08:23:42.119551Z"},"trusted":true},"outputs":[],"source":["from haystack.document_stores import FAISSDocumentStore\n","from haystack.nodes import EmbeddingRetriever"]},{"cell_type":"code","execution_count":10,"metadata":{"execution":{"iopub.execute_input":"2022-08-21T08:23:42.129562Z","iopub.status.busy":"2022-08-21T08:23:42.128772Z","iopub.status.idle":"2022-08-21T08:23:42.259879Z","shell.execute_reply":"2022-08-21T08:23:42.258950Z","shell.execute_reply.started":"2022-08-21T08:23:42.129518Z"},"trusted":true},"outputs":[],"source":["# the document store settings are those compatible with Embedding Retriever\n","document_store = FAISSDocumentStore(\n"," similarity=\"dot_product\",\n"," embedding_dim=768)"]},{"cell_type":"code","execution_count":46,"metadata":{"execution":{"iopub.execute_input":"2022-08-21T08:43:25.952230Z","iopub.status.busy":"2022-08-21T08:43:25.951856Z","iopub.status.idle":"2022-08-21T08:46:12.506842Z","shell.execute_reply":"2022-08-21T08:46:12.505845Z","shell.execute_reply.started":"2022-08-21T08:43:25.952198Z"},"trusted":true},"outputs":[{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"dbd72ecf0d36401ba26826f7d9a42540","version_major":2,"version_minor":0},"text/plain":["Writing Documents: 0%| | 0/50024 [00:00,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ]" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "preprocessed_docs[:10]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# select only documents with at least 10 words. Otherwise, the documents are not very informative\n", + "preprocessed_docs = [doc for doc in preprocessed_docs if len(doc.content.split()) >= 10]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create document store ([FAISS](https://github.com/facebookresearch/faiss)) and write documents" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "execution": { + "iopub.execute_input": "2022-08-21T08:23:42.119585Z", + "iopub.status.busy": "2022-08-21T08:23:42.118544Z", + "iopub.status.idle": "2022-08-21T08:23:42.124669Z", + "shell.execute_reply": "2022-08-21T08:23:42.123597Z", + "shell.execute_reply.started": "2022-08-21T08:23:42.119551Z" + }, + "trusted": true + }, + "outputs": [], + "source": [ + "from haystack.document_stores import FAISSDocumentStore\n", + "from haystack.nodes import EmbeddingRetriever" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "execution": { + "iopub.execute_input": "2022-08-21T08:23:42.129562Z", + "iopub.status.busy": "2022-08-21T08:23:42.128772Z", + "iopub.status.idle": "2022-08-21T08:23:42.259879Z", + "shell.execute_reply": "2022-08-21T08:23:42.258950Z", + "shell.execute_reply.started": "2022-08-21T08:23:42.129518Z" + }, + "trusted": true + }, + "outputs": [], + "source": [ + "# the document store settings are those compatible with Embedding Retriever\n", + "document_store = FAISSDocumentStore(similarity=\"dot_product\", embedding_dim=768)" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": { + "execution": { + "iopub.execute_input": "2022-08-21T08:43:25.952230Z", + "iopub.status.busy": "2022-08-21T08:43:25.951856Z", + "iopub.status.idle": "2022-08-21T08:46:12.506842Z", + "shell.execute_reply": "2022-08-21T08:46:12.505845Z", + "shell.execute_reply.started": "2022-08-21T08:43:25.952198Z" + }, + "trusted": true + }, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "dbd72ecf0d36401ba26826f7d9a42540", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Writing Documents: 0%| | 0/50024 [00:00