Skip to content

Commit

Permalink
update notebooks
Browse files Browse the repository at this point in the history
  • Loading branch information
enjalot committed Nov 30, 2024
1 parent d4bba02 commit 8f50735
Show file tree
Hide file tree
Showing 2 changed files with 96 additions and 178 deletions.
272 changes: 95 additions & 177 deletions notebooks/common-corpus-sample.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -12,18 +12,32 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "e2e6bc13bd5a4f66bda0e1190c589e3e",
"model_id": "22eb6e3c1c3e4ed099f635298a60744f",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading readme: 0%| | 0.00/7.56k [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "477bd80988f54375b8451fe760cbe3f1",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading readme: 0%| | 0.00/23.2k [00:00<?, ?B/s]"
"Resolving data files: 0%| | 0/10009 [00:00<?, ?it/s]"
]
},
"metadata": {},
Expand All @@ -32,21 +46,21 @@
],
"source": [
"from datasets import load_dataset\n",
"dataset = load_dataset(\"HuggingFaceFW/fineweb-edu\", data_files=\"sample/10BT/*.parquet\", streaming=True, split=\"train\")\n"
"dataset = load_dataset(\"PleIAs/common_corpus\", streaming=True, split=\"train\")\n"
]
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"dataset_head = dataset.take(10000)"
"dataset_head = dataset.take(100000)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -55,7 +69,7 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 5,
"metadata": {},
"outputs": [
{
Expand All @@ -79,126 +93,69 @@
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>identifier</th>\n",
" <th>collection</th>\n",
" <th>license</th>\n",
" <th>text</th>\n",
" <th>id</th>\n",
" <th>dump</th>\n",
" <th>url</th>\n",
" <th>file_path</th>\n",
" <th>language</th>\n",
" <th>language_score</th>\n",
" <th>token_count</th>\n",
" <th>score</th>\n",
" <th>int_score</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>The Independent Jane\\nFor all the love, romanc...</td>\n",
" <td>&lt;urn:uuid:0d8a309d-25c5-405d-a08a-c11239f0d717&gt;</td>\n",
" <td>CC-MAIN-2013-20</td>\n",
" <td>http://austenauthors.net/the-independent-jane</td>\n",
" <td>s3://commoncrawl/crawl-data/CC-MAIN-2013-20/se...</td>\n",
" <td>en</td>\n",
" <td>0.974320</td>\n",
" <td>845</td>\n",
" <td>2.750000</td>\n",
" <td>3</td>\n",
" <td>2021/52021XC0713(02)/52021XC0713(02)_SK.txt_5</td>\n",
" <td>Eurlex</td>\n",
" <td>CC-By</td>\n",
" <td>(37)  Riadenie cyklu projektu predstavuje proc...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Taking Play Seriously\\nBy ROBIN MARANTZ HENIG\\...</td>\n",
" <td>&lt;urn:uuid:316c7af5-14e1-4d0b-9576-753e17ef2cc5&gt;</td>\n",
" <td>CC-MAIN-2013-20</td>\n",
" <td>http://query.nytimes.com/gst/fullpage.html?res...</td>\n",
" <td>s3://commoncrawl/crawl-data/CC-MAIN-2013-20/se...</td>\n",
" <td>en</td>\n",
" <td>0.961459</td>\n",
" <td>1055</td>\n",
" <td>2.562500</td>\n",
" <td>3</td>\n",
" <td>github_open_source_100_1_0</td>\n",
" <td>Github OpenSource</td>\n",
" <td>Various open source</td>\n",
" <td>// Copyright (c) 2021 Yoakke.\\n// Licensed und...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>How do you get HIV?\\nHIV can be passed on when...</td>\n",
" <td>&lt;urn:uuid:a3e140cd-7f25-48c9-a2f0-a7d0b1954e0d&gt;</td>\n",
" <td>CC-MAIN-2013-20</td>\n",
" <td>http://www.childline.org.uk/Explore/SexRelatio...</td>\n",
" <td>s3://commoncrawl/crawl-data/CC-MAIN-2013-20/se...</td>\n",
" <td>en</td>\n",
" <td>0.966757</td>\n",
" <td>136</td>\n",
" <td>3.125000</td>\n",
" <td>3</td>\n",
" <td>github_open_source_100_1_1</td>\n",
" <td>Github OpenSource</td>\n",
" <td>Various open source</td>\n",
" <td>#include &lt;sys/fcntl.h&gt;\\n#include &lt;unistd.h&gt;\\n#...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>CTComms sends on average 2 million emails mont...</td>\n",
" <td>&lt;urn:uuid:c337bcd8-6aa1-4f2d-8c48-b916442ebbee&gt;</td>\n",
" <td>CC-MAIN-2013-20</td>\n",
" <td>http://www.ctt.org/resource_centre/getting_sta...</td>\n",
" <td>s3://commoncrawl/crawl-data/CC-MAIN-2013-20/se...</td>\n",
" <td>en</td>\n",
" <td>0.910602</td>\n",
" <td>3479</td>\n",
" <td>3.234375</td>\n",
" <td>3</td>\n",
" <td>sn83002748_1918-10-24_1_9_1</td>\n",
" <td>US-PD-Newspapers</td>\n",
" <td>Public Domain</td>\n",
" <td>POLITICAL ADVERTISEMENTS JOHN F. GOLDY, Os Sav...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Hold the salt: UCLA engineers develop revoluti...</td>\n",
" <td>&lt;urn:uuid:c0b175bb-65fb-420e-a881-a80b91d00ecd&gt;</td>\n",
" <td>CC-MAIN-2013-20</td>\n",
" <td>http://www.environment.ucla.edu/water/news/art...</td>\n",
" <td>s3://commoncrawl/crawl-data/CC-MAIN-2013-20/se...</td>\n",
" <td>en</td>\n",
" <td>0.924981</td>\n",
" <td>1115</td>\n",
" <td>2.812500</td>\n",
" <td>3</td>\n",
" <td>github_open_source_100_1_2</td>\n",
" <td>Github OpenSource</td>\n",
" <td>Various open source</td>\n",
" <td>{#\\n/**\\n * @file\\n * Theme override for a men...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" text \\\n",
"0 The Independent Jane\\nFor all the love, romanc... \n",
"1 Taking Play Seriously\\nBy ROBIN MARANTZ HENIG\\... \n",
"2 How do you get HIV?\\nHIV can be passed on when... \n",
"3 CTComms sends on average 2 million emails mont... \n",
"4 Hold the salt: UCLA engineers develop revoluti... \n",
"\n",
" id dump \\\n",
"0 <urn:uuid:0d8a309d-25c5-405d-a08a-c11239f0d717> CC-MAIN-2013-20 \n",
"1 <urn:uuid:316c7af5-14e1-4d0b-9576-753e17ef2cc5> CC-MAIN-2013-20 \n",
"2 <urn:uuid:a3e140cd-7f25-48c9-a2f0-a7d0b1954e0d> CC-MAIN-2013-20 \n",
"3 <urn:uuid:c337bcd8-6aa1-4f2d-8c48-b916442ebbee> CC-MAIN-2013-20 \n",
"4 <urn:uuid:c0b175bb-65fb-420e-a881-a80b91d00ecd> CC-MAIN-2013-20 \n",
"\n",
" url \\\n",
"0 http://austenauthors.net/the-independent-jane \n",
"1 http://query.nytimes.com/gst/fullpage.html?res... \n",
"2 http://www.childline.org.uk/Explore/SexRelatio... \n",
"3 http://www.ctt.org/resource_centre/getting_sta... \n",
"4 http://www.environment.ucla.edu/water/news/art... \n",
"\n",
" file_path language language_score \\\n",
"0 s3://commoncrawl/crawl-data/CC-MAIN-2013-20/se... en 0.974320 \n",
"1 s3://commoncrawl/crawl-data/CC-MAIN-2013-20/se... en 0.961459 \n",
"2 s3://commoncrawl/crawl-data/CC-MAIN-2013-20/se... en 0.966757 \n",
"3 s3://commoncrawl/crawl-data/CC-MAIN-2013-20/se... en 0.910602 \n",
"4 s3://commoncrawl/crawl-data/CC-MAIN-2013-20/se... en 0.924981 \n",
" identifier collection \\\n",
"0 2021/52021XC0713(02)/52021XC0713(02)_SK.txt_5 Eurlex \n",
"1 github_open_source_100_1_0 Github OpenSource \n",
"2 github_open_source_100_1_1 Github OpenSource \n",
"3 sn83002748_1918-10-24_1_9_1 US-PD-Newspapers \n",
"4 github_open_source_100_1_2 Github OpenSource \n",
"\n",
" token_count score int_score \n",
"0 845 2.750000 3 \n",
"1 1055 2.562500 3 \n",
"2 136 3.125000 3 \n",
"3 3479 3.234375 3 \n",
"4 1115 2.812500 3 "
" license text \n",
"0 CC-By (37)  Riadenie cyklu projektu predstavuje proc... \n",
"1 Various open source // Copyright (c) 2021 Yoakke.\\n// Licensed und... \n",
"2 Various open source #include <sys/fcntl.h>\\n#include <unistd.h>\\n#... \n",
"3 Public Domain POLITICAL ADVERTISEMENTS JOHN F. GOLDY, Os Sav... \n",
"4 Various open source {#\\n/**\\n * @file\\n * Theme override for a men... "
]
},
"execution_count": 11,
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -209,7 +166,7 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 6,
"metadata": {},
"outputs": [
{
Expand All @@ -226,7 +183,7 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 7,
"metadata": {},
"outputs": [
{
Expand All @@ -235,84 +192,45 @@
"text": [
"Loading environment variables from: /Users/enjalot/code/latent-scope/notebooks/.env\n",
"DATA DIR /Users/enjalot/latent-scope-demo\n",
"DIRECTORY /Users/enjalot/latent-scope-demo/fineweb-edu-10k\n",
" text \\\n",
"0 The Independent Jane\\nFor all the love, romanc... \n",
"1 Taking Play Seriously\\nBy ROBIN MARANTZ HENIG\\... \n",
"2 How do you get HIV?\\nHIV can be passed on when... \n",
"3 CTComms sends on average 2 million emails mont... \n",
"4 Hold the salt: UCLA engineers develop revoluti... \n",
"\n",
" id dump \\\n",
"0 <urn:uuid:0d8a309d-25c5-405d-a08a-c11239f0d717> CC-MAIN-2013-20 \n",
"1 <urn:uuid:316c7af5-14e1-4d0b-9576-753e17ef2cc5> CC-MAIN-2013-20 \n",
"2 <urn:uuid:a3e140cd-7f25-48c9-a2f0-a7d0b1954e0d> CC-MAIN-2013-20 \n",
"3 <urn:uuid:c337bcd8-6aa1-4f2d-8c48-b916442ebbee> CC-MAIN-2013-20 \n",
"4 <urn:uuid:c0b175bb-65fb-420e-a881-a80b91d00ecd> CC-MAIN-2013-20 \n",
"\n",
" url \\\n",
"0 http://austenauthors.net/the-independent-jane \n",
"1 http://query.nytimes.com/gst/fullpage.html?res... \n",
"2 http://www.childline.org.uk/Explore/SexRelatio... \n",
"3 http://www.ctt.org/resource_centre/getting_sta... \n",
"4 http://www.environment.ucla.edu/water/news/art... \n",
"\n",
" file_path language language_score \\\n",
"0 s3://commoncrawl/crawl-data/CC-MAIN-2013-20/se... en 0.974320 \n",
"1 s3://commoncrawl/crawl-data/CC-MAIN-2013-20/se... en 0.961459 \n",
"2 s3://commoncrawl/crawl-data/CC-MAIN-2013-20/se... en 0.966757 \n",
"3 s3://commoncrawl/crawl-data/CC-MAIN-2013-20/se... en 0.910602 \n",
"4 s3://commoncrawl/crawl-data/CC-MAIN-2013-20/se... en 0.924981 \n",
"\n",
" token_count score int_score \n",
"0 845 2.750000 3 \n",
"1 1055 2.562500 3 \n",
"2 136 3.125000 3 \n",
"3 3479 3.234375 3 \n",
"4 1115 2.812500 3 \n",
" text \\\n",
"9995 Here we have the inspiration for the movie tre... \n",
"9996 Love and Logic Resource KitLove and Logic is a... \n",
"9997 In the event of fire, people need to know exac... \n",
"9998 It may be a small comfort to those planning th... \n",
"9999 A 13-year-old middle school student is working... \n",
"\n",
" id dump \\\n",
"9995 <urn:uuid:57ae955d-687d-497f-93d4-d5314a541145> CC-MAIN-2017-26 \n",
"9996 <urn:uuid:3df9d504-e03a-4ef2-93ae-1b0fe24baa5e> CC-MAIN-2017-26 \n",
"9997 <urn:uuid:cbd2548e-361a-4de4-98e6-b5ecd485bf4f> CC-MAIN-2017-26 \n",
"9998 <urn:uuid:51ee7105-5715-47c0-a4d7-d6c1b39d3344> CC-MAIN-2017-26 \n",
"9999 <urn:uuid:5f525003-bf93-42d1-b05b-29a50aacfb63> CC-MAIN-2017-26 \n",
"\n",
" url \\\n",
"9995 https://www.hamahamaoysters.com/blogs/learn/18... \n",
"9996 http://holly.rpes.schoolfusion.us/modules/cms/... \n",
"9997 http://churchsafety.org.uk/information/fire/f_... \n",
"9998 http://insideindustrynews.com/curiosity-gives-... \n",
"9999 http://juneauempire.com/stories/120505/loc_200... \n",
"DIRECTORY /Users/enjalot/latent-scope-demo/common-corpus-100k\n",
" identifier collection \\\n",
"0 2021/52021XC0713(02)/52021XC0713(02)_SK.txt_5 Eurlex \n",
"1 github_open_source_100_1_0 Github OpenSource \n",
"2 github_open_source_100_1_1 Github OpenSource \n",
"3 sn83002748_1918-10-24_1_9_1 US-PD-Newspapers \n",
"4 github_open_source_100_1_2 Github OpenSource \n",
"\n",
" file_path language \\\n",
"9995 s3://commoncrawl/crawl-data/CC-MAIN-2017-26/se... en \n",
"9996 s3://commoncrawl/crawl-data/CC-MAIN-2017-26/se... en \n",
"9997 s3://commoncrawl/crawl-data/CC-MAIN-2017-26/se... en \n",
"9998 s3://commoncrawl/crawl-data/CC-MAIN-2017-26/se... en \n",
"9999 s3://commoncrawl/crawl-data/CC-MAIN-2017-26/se... en \n",
" license text \n",
"0 CC-By (37)  Riadenie cyklu projektu predstavuje proc... \n",
"1 Various open source // Copyright (c) 2021 Yoakke.\\n// Licensed und... \n",
"2 Various open source #include <sys/fcntl.h>\\n#include <unistd.h>\\n#... \n",
"3 Public Domain POLITICAL ADVERTISEMENTS JOHN F. GOLDY, Os Sav... \n",
"4 Various open source {#\\n/**\\n * @file\\n * Theme override for a men... \n",
" identifier collection \\\n",
"99995 jbc.bj.uj.edu.pl.NDIGCZAS018898_66909803_1 Polish-PD \n",
"99996 US-19956008-A_1 USPTO \n",
"99997 github_open_source_100_2_8355 Github OpenSource \n",
"99998 github_open_source_100_2_8356 Github OpenSource \n",
"99999 arithmeticinplai00fish_3 English-PD \n",
"\n",
" language_score token_count score int_score \n",
"9995 0.961133 368 2.875000 3 \n",
"9996 0.895080 249 2.828125 3 \n",
"9997 0.960923 1081 3.171875 3 \n",
"9998 0.938971 141 2.968750 3 \n",
"9999 0.981334 1131 2.859375 3 \n",
"Index(['text', 'id', 'dump', 'url', 'file_path', 'language', 'language_score',\n",
" 'token_count', 'score', 'int_score'],\n",
" dtype='object')\n",
"wrote /Users/enjalot/latent-scope-demo/fineweb-edu-10k/input.parquet\n"
" license text \n",
"99995 Public Domain UK Abouament: kwartalnie: w ekspedycji . „ 180... \n",
"99996 Public Domain Vehicle seat belt guiding device\\n\\nABSTRACT\\n... \n",
"99997 Various open source var __ref = require(\"./../../internal/ref\");\\n... \n",
"99998 Various open source @font-face {\\n font-family: \"Alegreya Sans\"... \n",
"99999 Public Domain Prices: $4. J Product by 2. J MULTIPLICATION, ... \n",
"Index(['identifier', 'collection', 'license', 'text'], dtype='object')\n",
"checking column types\n",
"COLUMN identifier TYPE string\n",
"COLUMN collection TYPE string\n",
"COLUMN license TYPE string\n",
"COLUMN text TYPE string\n",
"wrote /Users/enjalot/latent-scope-demo/common-corpus-100k/input.parquet\n"
]
}
],
"source": [
"ls.ingest(\"fineweb-edu-10k\", df, \"text\")"
"ls.ingest(\"common-corpus-100k\", df, \"text\")"
]
},
{
Expand All @@ -339,7 +257,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.5"
"version": "3.12.7"
}
},
"nbformat": 4,
Expand Down
2 changes: 1 addition & 1 deletion notebooks/fineweb-edu-sample.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -339,7 +339,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.5"
"version": "3.12.7"
}
},
"nbformat": 4,
Expand Down

0 comments on commit 8f50735

Please sign in to comment.