Prep for 0.1b3 release (#184)

* Get tests working on Pandas 1.0.x
* Re-enable Feather example in intro notebook
* Rerun notebooks prior to release
* Rerun tutorial notebooks prior to release
* Update version number
CODAIT · Apr 1, 2021 · 844a525 · 844a525
1 parent dd521f7
commit 844a525
Show file tree

Hide file tree

Showing 11 changed files with 216 additions and 163 deletions.
diff --git a/notebooks/Analyze_Text.ipynb b/notebooks/Analyze_Text.ipynb
@@ -143,7 +143,7 @@
     {
      "data": {
       "text/plain": [
-       "<ibm_watson.natural_language_understanding_v1.NaturalLanguageUnderstandingV1 at 0x7fb258940150>"
+       "<ibm_watson.natural_language_understanding_v1.NaturalLanguageUnderstandingV1 at 0x7ff68869a510>"
       ]
      },
      "execution_count": 3,

diff --git a/notebooks/Integrate_NLP_Libraries.ipynb b/notebooks/Integrate_NLP_Libraries.ipynb
diff --git a/notebooks/Model_Training_with_BERT.ipynb b/notebooks/Model_Training_with_BERT.ipynb
@@ -1643,7 +1643,7 @@
     {
      "data": {
       "text/plain": [
-       "<text_extensions_for_pandas.array.tensor.TensorDtype at 0x7ff8d05e0290>"
+       "<text_extensions_for_pandas.array.tensor.TensorDtype at 0x7fb13131ee10>"
       ]
      },
      "execution_count": 13,
@@ -1996,7 +1996,7 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "061593082c6e43f8bbbdab066a447502",
+       "model_id": "9e44a9af650543e59d81dfd8d5baa4ed",
        "version_major": 2,
        "version_minor": 0
       },
@@ -2017,7 +2017,7 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "d2e59e90113648dfb0a929c90ff7d1fb",
+       "model_id": "acc0673fceae466f94cc16ccfbd67fdd",
        "version_major": 2,
        "version_minor": 0
       },
@@ -2038,7 +2038,7 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "716b10c7c7a840048c1a780ff1723d84",
+       "model_id": "a0f64dff11304556ad22cd8df77954b7",
        "version_major": 2,
        "version_minor": 0
       },
@@ -3183,8 +3183,8 @@
      "output_type": "stream",
      "text": [
       "[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
-      "[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  9.1min remaining:    0.0s\n",
-      "[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  9.1min finished\n"
+      "[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 11.9min remaining:    0.0s\n",
+      "[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 11.9min finished\n"
      ]
     },
     {
@@ -5041,7 +5041,7 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "0a775ec7ee9f42ccb4367432d97f6958",
+       "model_id": "fde70dc5306b41f09a4844106b127aa1",
        "version_major": 2,
        "version_minor": 0
       },

diff --git a/notebooks/Text_Extensions_for_Pandas_Overview.ipynb b/notebooks/Text_Extensions_for_Pandas_Overview.ipynb
@@ -1522,7 +1522,7 @@
        "        [4, 5],\n",
        "        [6, 7],\n",
        "        [8, 9]]),\n",
-       " <text_extensions_for_pandas.array.tensor.TensorDtype at 0x7fb2e82c5d10>)"
+       " <text_extensions_for_pandas.array.tensor.TensorDtype at 0x7fe6a86432d0>)"
       ]
      },
      "execution_count": 22,
@@ -1903,7 +1903,7 @@
        "    <tr>\n",
        "      <th>0</th>\n",
        "      <td>[0, 2): 'In'</td>\n",
-       "      <td>[0, 1, 0, 0]</td>\n",
+       "      <td>[0, 0, 1, 0]</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
@@ -1918,24 +1918,24 @@
        "    <tr>\n",
        "      <th>3</th>\n",
        "      <td>[11, 15): 'King'</td>\n",
-       "      <td>[0, 0, 0, 1]</td>\n",
+       "      <td>[0, 1, 0, 0]</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>4</th>\n",
        "      <td>[16, 22): 'Arthur'</td>\n",
-       "      <td>[0, 1, 0, 0]</td>\n",
+       "      <td>[0, 0, 1, 0]</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "text/plain": [
        "                 span      features\n",
-       "0        [0, 2): 'In'  [0, 1, 0, 0]\n",
+       "0        [0, 2): 'In'  [0, 0, 1, 0]\n",
        "1        [3, 5): 'AD'  [0, 1, 0, 0]\n",
        "2       [6, 9): '932'  [0, 0, 0, 1]\n",
-       "3    [11, 15): 'King'  [0, 0, 0, 1]\n",
-       "4  [16, 22): 'Arthur'  [0, 1, 0, 0]"
+       "3    [11, 15): 'King'  [0, 1, 0, 0]\n",
+       "4  [16, 22): 'Arthur'  [0, 0, 1, 0]"
       ]
      },
      "execution_count": 32,
@@ -1958,22 +1958,88 @@
     "# Save DataFrame to a feather file.\n",
     "# Feather is a lightweight, fast binary columnar format, with basic\n",
     "# compression and support built into Pandas.\n",
-    "\n",
-    "# TODO: Temporarily disabled while we revamp Feather support to handle multi-doc span arrays\n",
-    "#df.to_feather(\"outputs/tp_overview.feather\")"
+    "df.to_feather(\"outputs/tp_overview.feather\")"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 34,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>span</th>\n",
+       "      <th>features</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>[0, 2): 'In'</td>\n",
+       "      <td>[0, 0, 1, 0]</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>[3, 5): 'AD'</td>\n",
+       "      <td>[0, 1, 0, 0]</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>[6, 9): '932'</td>\n",
+       "      <td>[0, 0, 0, 1]</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>[11, 15): 'King'</td>\n",
+       "      <td>[0, 1, 0, 0]</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>[16, 22): 'Arthur'</td>\n",
+       "      <td>[0, 0, 1, 0]</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                 span      features\n",
+       "0        [0, 2): 'In'  [0, 0, 1, 0]\n",
+       "1        [3, 5): 'AD'  [0, 1, 0, 0]\n",
+       "2       [6, 9): '932'  [0, 0, 0, 1]\n",
+       "3    [11, 15): 'King'  [0, 1, 0, 0]\n",
+       "4  [16, 22): 'Arthur'  [0, 0, 1, 0]"
+      ]
+     },
+     "execution_count": 34,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
     "# Read the file back into a new DataFrame.\n",
     "\n",
-    "# TODO: Temporarily disabled while we revamp Feather support to handle multi-doc span arrays\n",
-    "#df_load = pd.read_feather(\"outputs/tp_overview.feather\")\n",
-    "#df_load.head()"
+    "df_load = pd.read_feather(\"outputs/tp_overview.feather\")\n",
+    "df_load.head()"
    ]
   },
   {

diff --git a/notebooks/Understand_Tables.ipynb b/notebooks/Understand_Tables.ipynb
diff --git a/setup.py b/setup.py
@@ -24,7 +24,7 @@
 
 setuptools.setup(
     name="text_extensions_for_pandas",
-    version="0.1b2",
+    version="0.1b3",
     author="IBM",
     author_email="[email protected]",
     description="Natural language processing support for Pandas dataframes.",

diff --git a/text_extensions_for_pandas/array/test_token_span.py b/text_extensions_for_pandas/array/test_token_span.py
@@ -518,7 +518,8 @@ def data_for_grouping(dtype):
     return pd.array([b, b, na, na, a, a, b, c], dtype=dtype)
 
 
-# Can't import due to dependencies, taken from pandas.conftest import all_compare_operators
+# Can't import due to dependencies, taken
+# from pandas.conftest import all_compare_operators
 @pytest.fixture(params=["__eq__", "__ne__", "__lt__", "__gt__", "__le__", "__ge__"])
 def all_compare_operators(request):
     return request.param
@@ -552,14 +553,10 @@ class TestPandasConstructors(base.BaseConstructorsTests):
     def test_series_constructor_no_data_with_index(self, dtype, na_value):
         pass
 
+    @pytest.mark.skipif(pd.__version__.startswith("1.0"),
+                        reason="Test added in Pandas 1.1.0")
     def test_construct_empty_dataframe(self, dtype):
         super().test_construct_empty_dataframe(dtype)
-        # try:
-        #     with pytest.raises(TypeError, match="Expected SpanArray as tokens"):
-        #         super().test_construct_empty_dataframe(dtype)
-        # except AttributeError:
-        #     # Test added in Pandas 1.1.0, ignore for earlier versions
-        #     pass
 
 
 class TestPandasGetitem(base.BaseGetitemTests):

diff --git a/tutorials/corpus/CoNLL_2.ipynb b/tutorials/corpus/CoNLL_2.ipynb
@@ -3730,7 +3730,7 @@
        "\n",
        "<div id=\"spanArray\">\n",
        "    <div id=\"spans\" \n",
-       "     style=\"background-color:#F0F0F0; border: 1px solid #E0E0E0; float:left; padding:10px;\">\n",
+       "     style=\"color: var(--jp-layout-color2); border: 1px solid var(--jp-border-color0); float:left; padding:10px;\">\n",
        "        <table border=\"1\" class=\"dataframe\">\n",
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
@@ -3899,11 +3899,11 @@
        "</table>\n",
        "    </div>\n",
        "    <div id=\"text\"\n",
-       "     style=\"float:right; background-color:#F5F5F5; border: 1px solid #E0E0E0; width: 60%;\">\n",
+       "     style=\"float:right; border: 1px solid var(--jp-border-color0); width: 60%;\">\n",
        "\n",
        "                <div style=\"float:center; padding:10px\">\n",
-       "                    <p style=\"font-family:monospace\">\n",
-       "                        -DOCSTART-<br><span style=\"background-color:yellow\">Belgian</span> police smash major drugs rings, 30 arrested.<br><span style=\"background-color:yellow\">BRUSSELS</span> 1996-12-06<br>Police smashed two drugs smuggling rings and arrested 30 people after a taxidriver in <span style=\"background-color:yellow\">Spain</span> alerted them to a suitcase of heroin left in his cab, <span style=\"background-color:yellow\">Belgian</span> police said on Friday.<br>Police seized dozens of kilos of heroin with a street value of hundreds of millions of <span style=\"background-color:yellow\">Belgian</span> francs, a public prosecutor&#39;s office spokesman in the port city of <span style=\"background-color:yellow\">Antwerp</span> said.<br>He said a 24-year-old <span style=\"background-color:yellow\">Belgian</span> woman left a suitcase containing 13 kg (29 lb) of heroin in a taxi in <span style=\"background-color:yellow\">Barcelona</span>.<br>The taxidriver alerted police who arrested a 33-year-old <span style=\"background-color:yellow\">Turkish</span> man when he came to pick up the suitcase at a lost luggage office.<br>The woman was later arrested in <span style=\"background-color:yellow\">Belgium</span>.<br>She and the <span style=\"background-color:yellow\">Turkish</span> man smuggled heroin from <span style=\"background-color:yellow\">Turkey</span> to <span style=\"background-color:yellow\">Antwerp</span> from where it was taken to <span style=\"background-color:yellow\">Spain</span>, <span style=\"background-color:yellow\">France</span> and <span style=\"background-color:yellow\">Germany</span> by others, the spokesman said.<br>He said 14 people were arrested in <span style=\"background-color:yellow\">Belgium</span> and 16 others in other <span style=\"background-color:yellow\">European</span> nations after an investigation lasting nearly a year.<br>(<span>&#36;</span>1=32.14 <span style=\"background-color:yellow\">Belgian</span> Franc)\n",
+       "                    <p style=\"font-family:var(--jp-code-font-family); font-size:var(--jp-code-font-size)\">\n",
+       "                        -DOCSTART-<br><span style=\"background-color:rgba(255, 215, 0, 0.5)\">Belgian</span> police smash major drugs rings, 30 arrested.<br><span style=\"background-color:rgba(255, 215, 0, 0.5)\">BRUSSELS</span> 1996-12-06<br>Police smashed two drugs smuggling rings and arrested 30 people after a taxidriver in <span style=\"background-color:rgba(255, 215, 0, 0.5)\">Spain</span> alerted them to a suitcase of heroin left in his cab, <span style=\"background-color:rgba(255, 215, 0, 0.5)\">Belgian</span> police said on Friday.<br>Police seized dozens of kilos of heroin with a street value of hundreds of millions of <span style=\"background-color:rgba(255, 215, 0, 0.5)\">Belgian</span> francs, a public prosecutor&#39;s office spokesman in the port city of <span style=\"background-color:rgba(255, 215, 0, 0.5)\">Antwerp</span> said.<br>He said a 24-year-old <span style=\"background-color:rgba(255, 215, 0, 0.5)\">Belgian</span> woman left a suitcase containing 13 kg (29 lb) of heroin in a taxi in <span style=\"background-color:rgba(255, 215, 0, 0.5)\">Barcelona</span>.<br>The taxidriver alerted police who arrested a 33-year-old <span style=\"background-color:rgba(255, 215, 0, 0.5)\">Turkish</span> man when he came to pick up the suitcase at a lost luggage office.<br>The woman was later arrested in <span style=\"background-color:rgba(255, 215, 0, 0.5)\">Belgium</span>.<br>She and the <span style=\"background-color:rgba(255, 215, 0, 0.5)\">Turkish</span> man smuggled heroin from <span style=\"background-color:rgba(255, 215, 0, 0.5)\">Turkey</span> to <span style=\"background-color:rgba(255, 215, 0, 0.5)\">Antwerp</span> from where it was taken to <span style=\"background-color:rgba(255, 215, 0, 0.5)\">Spain</span>, <span style=\"background-color:rgba(255, 215, 0, 0.5)\">France</span> and <span style=\"background-color:rgba(255, 215, 0, 0.5)\">Germany</span> by others, the spokesman said.<br>He said 14 people were arrested in <span style=\"background-color:rgba(255, 215, 0, 0.5)\">Belgium</span> and 16 others in other <span style=\"background-color:rgba(255, 215, 0, 0.5)\">European</span> nations after an investigation lasting nearly a year.<br>(<span>&#36;</span>1=32.14 <span style=\"background-color:rgba(255, 215, 0, 0.5)\">Belgian</span> Franc)\n",
        "                    </p>\n",
        "                </div>\n",
        "\n",

diff --git a/tutorials/corpus/CoNLL_3.ipynb b/tutorials/corpus/CoNLL_3.ipynb
@@ -1805,7 +1805,7 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "815b01606369445c892dedefbfd4916b",
+       "model_id": "0a612388df9249dab67efb5a4d358d5c",
        "version_major": 2,
        "version_minor": 0
       },
@@ -1826,7 +1826,7 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "daa8e13738e2453c93e334d08b6d251b",
+       "model_id": "cae2fc8df4a44049be700f26f4f20e88",
        "version_major": 2,
        "version_minor": 0
       },
@@ -1847,7 +1847,7 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "f5e610805fac4f44b706c223dc091820",
+       "model_id": "83c8f7f605eb4a55ab194aad964e947f",
        "version_major": 2,
        "version_minor": 0
       },
@@ -3027,8 +3027,8 @@
      "output_type": "stream",
      "text": [
       "[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
-      "[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 41.7min remaining:    0.0s\n",
-      "[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 41.7min finished\n"
+      "[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 46.1min remaining:    0.0s\n",
+      "[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 46.1min finished\n"
      ]
     },
     {
@@ -6006,7 +6006,7 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "49d7275934fe48f3a27017d92225844a",
+       "model_id": "af7f756dbfc2467c9cf525caa83b83eb",
        "version_major": 2,
        "version_minor": 0
       },
@@ -6499,12 +6499,12 @@
     {
      "data": {
       "text/plain": [
-       "{'num_true_positives': 4169,\n",
+       "{'num_true_positives': 4329,\n",
        " 'num_entities': 5648,\n",
-       " 'num_extracted': 4929,\n",
-       " 'precision': 0.8458105092310814,\n",
-       " 'recall': 0.7381373937677054,\n",
-       " 'F1': 0.7883142668053323}"
+       " 'num_extracted': 5163,\n",
+       " 'precision': 0.8384660081348053,\n",
+       " 'recall': 0.7664660056657224,\n",
+       " 'F1': 0.8008509851077606}"
       ]
      },
      "execution_count": 38,
@@ -6965,7 +6965,7 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "571d376cdc19420d93df405970d42435",
+       "model_id": "63868a5aeb4e4847ba9b7df10e6d28b5",
        "version_major": 2,
        "version_minor": 0
       },
@@ -8598,7 +8598,7 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "ab550997976f4d9b9ea777894d28fd12",
+       "model_id": "13b1edab9e1241ccb22166e9c0c8ca40",
        "version_major": 2,
        "version_minor": 0
       },
@@ -10072,7 +10072,7 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "bd25f50dd1aa4bed9c9fc148b87603b5",
+       "model_id": "7110dc85e13145a2a9ab455aaa167948",
        "version_major": 2,
        "version_minor": 0
       },