
Commit

commit hive
programmingwithclaudio committed Jan 29, 2024
1 parent 9c751c8 commit b9cea79
Showing 4 changed files with 206 additions and 108 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -6,3 +6,5 @@ parquet_converter.py
imported_data
processed_data
raw_data
+notebook/.ipynb_checkpoints
+notebook/spark-warehouse/
14 changes: 12 additions & 2 deletions docker-compose.yml
@@ -83,14 +83,24 @@ services:
ports:
- "8889:8888"
- "8000:8000"
-#volumes:
-#- /home/ozi/pri:/home/jovyan
+volumes:
+- /home/ozi/pri:/home/jovyan
# - ./charts:/home/jovyan/work
# - ./imported_data:/home/jovyan/work/imported_data
# - ./processed_data:/home/jovyan/work/processed_data
# - ./raw_data:/home/jovyan/work/raw_data
environment:
- PYSPARK_PYTHON=python3
+# Hive connection settings
+- HIVE_SERVER2_HOST=hive-server
+- HIVE_SERVER2_PORT=10000
+- HIVE_METASTORE_HOST=hive-metastore
+- HIVE_METASTORE_PORT=9083
+# JDBC configuration for Hive
+- HIVE_JDBC_URL=jdbc:hive2://hive-server:10000/default
+- HIVE_JDBC_DRIVER=org.apache.hive.jdbc.HiveDriver
+- HIVE_JDBC_USER=hive
+- HIVE_JDBC_PASSWORD=hive

volumes:
namenode:
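Note: the HIVE_* variables added above let notebook code reach HiveServer2 without hard-coding addresses. A minimal connectivity check in Python, assuming a client such as PyHive is available in the jovyan image (the commit itself does not install one):

import os
from pyhive import hive

# Read the connection settings injected through docker-compose.yml.
host = os.environ.get("HIVE_SERVER2_HOST", "hive-server")
port = int(os.environ.get("HIVE_SERVER2_PORT", "10000"))
user = os.environ.get("HIVE_JDBC_USER", "hive")

# Open a session against HiveServer2 and run a smoke-test query.
conn = hive.connect(host=host, port=port, username=user, database="default")
cursor = conn.cursor()
cursor.execute("SHOW DATABASES")
print(cursor.fetchall())
cursor.close()
conn.close()

Whether HIVE_JDBC_PASSWORD is actually needed depends on the server's authentication mode; with the default plain setup, the username alone is enough.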
2 changes: 2 additions & 0 deletions hadoop-hive.env
@@ -6,6 +6,8 @@ HIVE_SITE_CONF_datanucleus_autoCreateSchema=false
HIVE_SITE_CONF_hive_metastore_uris=thrift://hive-metastore:9083
HDFS_CONF_dfs_namenode_datanode_registration_ip___hostname___check=false

+
+
CORE_CONF_fs_defaultFS=hdfs://namenode:8020
CORE_CONF_hadoop_http_staticuser_user=root
CORE_CONF_hadoop_proxyuser_hue_hosts=*
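For context: in the bde2020-style Hadoop/Hive images this stack is built on, each CORE_CONF_*/HDFS_CONF_*/HIVE_SITE_CONF_* variable is templated into the matching XML config file at container startup, with the variable suffix mapped back to a property name. A rough sketch of that naming rule as a hypothetical Python helper (the real entrypoint does this in shell; this is an assumption about its behavior, not a copy of it):

def env_to_property(name: str, prefix: str) -> str:
    # Sketch of the convention: '___' maps to '-', remaining '_' map to '.'.
    suffix = name[len(prefix):]
    return suffix.replace("___", "-").replace("_", ".")

# HIVE_SITE_CONF_hive_metastore_uris -> hive.metastore.uris
print(env_to_property("HIVE_SITE_CONF_hive_metastore_uris", "HIVE_SITE_CONF_"))

# HDFS_CONF_dfs_namenode_datanode_registration_ip___hostname___check
#   -> dfs.namenode.datanode.registration.ip-hostname-check
print(env_to_property(
    "HDFS_CONF_dfs_namenode_datanode_registration_ip___hostname___check",
    "HDFS_CONF_",
))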
296 changes: 190 additions & 106 deletions notebook/1_ETL_lending.ipynb
@@ -10,7 +10,7 @@
},
{
"cell_type": "code",
-"execution_count": 12,
+"execution_count": 1,
"id": "275e7706-2235-4024-b488-b5737b961091",
"metadata": {},
"outputs": [],
@@ -135,104 +135,32 @@
},
{
"cell_type": "code",
-"execution_count": 8,
+"execution_count": 5,
"id": "625eb239-731a-4fee-883c-bc21b2854183",
"metadata": {},
-"outputs": [
-{
-"name": "stdout",
-"output_type": "stream",
-"text": [
-"root\n",
-" |-- Amount Requested: date (nullable = true)\n",
-" |-- Application Date: date (nullable = true)\n",
-" |-- Loan Title: string (nullable = true)\n",
-" |-- Risk_Score: float (nullable = true)\n",
-" |-- Debt-To-Income Ratio: double (nullable = true)\n",
-" |-- Zip Code: string (nullable = true)\n",
-" |-- State: string (nullable = true)\n",
-" |-- Employment Length: string (nullable = true)\n",
-" |-- Policy Code: integer (nullable = true)\n",
-" |-- Day: integer (nullable = true)\n",
-" |-- Week: integer (nullable = true)\n",
-" |-- Month: integer (nullable = true)\n",
-"\n"
-]
-}
-],
+"outputs": [],
"source": [
-"rejected_dfm.printSchema()"
+"#rejected_dfm.printSchema()"
]
},
{
"cell_type": "code",
-"execution_count": 5,
+"execution_count": 6,
"id": "178d4b9f-15a4-4dcf-85f5-a24a48218988",
"metadata": {},
-"outputs": [
-{
-"name": "stdout",
-"output_type": "stream",
-"text": [
-"+----------------+----------------+--------------------+----------+--------------------+--------+-----+-----------------+-----------+\n",
-"|Amount Requested|Application Date| Loan Title|Risk_Score|Debt-To-Income Ratio|Zip Code|State|Employment Length|Policy Code|\n",
-"+----------------+----------------+--------------------+----------+--------------------+--------+-----+-----------------+-----------+\n",
-"| 1000.0| 2007-05-26|Wedding Covered b...| 693.0| 0.1| 481| NM| 4 years| 0|\n",
-"| 1000.0| 2007-05-26| Consolidating Debt| 703.0| 0.1| 010| MA| < 1 year| 0|\n",
-"| 11000.0| 2007-05-27|Want to consolida...| 715.0| 0.1| 212| MD| 1 year| 0|\n",
-"| 6000.0| 2007-05-27| waksman| 698.0| 0.3863999938964844| 017| MA| < 1 year| 0|\n",
-"| 1500.0| 2007-05-27| mdrigo| 509.0| 0.09430000305175781| 209| MD| < 1 year| 0|\n",
-"| 15000.0| 2007-05-27| Trinfiniti| 645.0| 0.0| 105| NY| 3 years| 0|\n",
-"| 10000.0| 2007-05-27| NOTIFYi Inc| 693.0| 0.1| 210| MD| < 1 year| 0|\n",
-"| 3900.0| 2007-05-27| For Justin.| 700.0| 0.1| 469| IN| 2 years| 0|\n",
-"| 3000.0| 2007-05-28| title?| 694.0| 0.1| 808| CO| 4 years| 0|\n",
-"| 2500.0| 2007-05-28| timgerst| 573.0| 0.11760000228881835| 407| KY| 4 years| 0|\n",
-"| 3900.0| 2007-05-28| need to consolidate| 710.0| 0.1| 705| LA| 10+ years| 0|\n",
-"| 1000.0| 2007-05-28| sixstrings| 680.0| 0.1| 424| KY| 1 year| 0|\n",
-"| 3000.0| 2007-05-28| bmoore5110| 688.0| 0.1| 190| PA| < 1 year| 0|\n",
-"| 1500.0| 2007-05-28| MHarkins| 704.0| 0.1| 189| PA| 3 years| 0|\n",
-"| 1000.0| 2007-05-28| Moving| 694.0| 0.1| 354| AL| < 1 year| 0|\n",
-"| 8000.0| 2007-05-28|Recent College Gr...| 708.0| 0.1| 374| TN| < 1 year| 0|\n",
-"| 12000.0| 2007-05-29| FoundersCafe.com| 685.0| 0.1| 770| TX| 3 years| 0|\n",
-"| 1000.0| 2007-05-29| UChicago2004| 698.0| 0.1| 207| MD| 3 years| 0|\n",
-"| 15000.0| 2007-05-29|Cancer is Killing...| 680.0| 0.1| 432| OH| < 1 year| 0|\n",
-"| 5000.0| 2007-05-29|2006-2007 College...| 680.0| 0.1| 011| MA| < 1 year| 0|\n",
-"+----------------+----------------+--------------------+----------+--------------------+--------+-----+-----------------+-----------+\n",
-"only showing top 20 rows\n",
-"\n"
-]
-}
-],
+"outputs": [],
"source": [
-"rejected_dfm.show()"
+"#rejected_dfm.show()"
]
},
{
"cell_type": "code",
-"execution_count": 6,
+"execution_count": 7,
"id": "fb3e2457-6f14-4036-b0b9-446311054b62",
"metadata": {},
-"outputs": [
-{
-"name": "stdout",
-"output_type": "stream",
-"text": [
-"root\n",
-" |-- Amount Requested: double (nullable = true)\n",
-" |-- Application Date: date (nullable = true)\n",
-" |-- Loan Title: string (nullable = true)\n",
-" |-- Risk_Score: float (nullable = true)\n",
-" |-- Debt-To-Income Ratio: double (nullable = true)\n",
-" |-- Zip Code: string (nullable = true)\n",
-" |-- State: string (nullable = true)\n",
-" |-- Employment Length: string (nullable = true)\n",
-" |-- Policy Code: integer (nullable = true)\n",
-"\n"
-]
-}
-],
+"outputs": [],
"source": [
-"rejected_dfm.printSchema()"
+"#rejected_dfm.printSchema()"
]
},
{
@@ -304,7 +232,7 @@
},
{
"cell_type": "code",
-"execution_count": 13,
+"execution_count": 11,
"id": "899d291f-981f-42a1-a84c-e16385fa0dee",
"metadata": {},
"outputs": [],
@@ -340,45 +268,201 @@
},
{
"cell_type": "code",
-"execution_count": null,
-"id": "6708a5fd-1a57-445c-85ec-6ea33f1f5d6d",
-"metadata": {},
-"outputs": [],
+"execution_count": 25,
+"id": "39629363-4d27-45d1-9736-b552dc7673a7",
+"metadata": {
+"scrolled": true
+},
+"outputs": [
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"+---------+\n",
+"|namespace|\n",
+"+---------+\n",
+"| default|\n",
+"| lending|\n",
+"+---------+\n",
+"\n"
+]
+}
+],
"source": [
-"# Example of how to create and store a DataFrame in a Hive table in Parquet format\n",
-"df.write.mode(\"overwrite\").parquet(\"/user/hive/warehouse/lending.db/train_data_parquet\")\n",
-"spark.sql(\"CREATE TABLE IF NOT EXISTS lending.train_data_parquet USING PARQUET LOCATION '/user/hive/warehouse/lending.db/train_data_parquet'\")\n"
+"# Define the Hive warehouse location in HDFS\n",
+"warehouse_location = \"/user/hive/warehouse\"\n",
+"\n",
+"# Create a Spark session\n",
+"spark = SparkSession.builder \\\n",
+" .appName(\"Test Hive Connection\") \\\n",
+" .config(\"spark.sql.warehouse.dir\", warehouse_location) \\\n",
+" .enableHiveSupport() \\\n",
+" .getOrCreate()\n",
+"\n",
+"# Run a simple query\n",
+"spark.sql(\"SHOW DATABASES\").show()"
]
},
{
"cell_type": "code",
-"execution_count": 15,
-"id": "a71aec30-1dd4-41cb-8e32-36a5bb69f9b9",
+"execution_count": 27,
+"id": "030093db-a839-476f-be3c-02758ff18a72",
"metadata": {},
"outputs": [],
"source": [
-"# Hive table names\n",
-"hive_train_table = \"train_data\"\n",
-"hive_test_table = \"test_data\"\n",
+"# Define the database and table\n",
+"lending_database = \"lending\"\n",
"\n",
-"# Create the databases if they do not exist\n",
-"spark.sql(\"CREATE DATABASE IF NOT EXISTS lending\")\n",
+"lending_train = \"train_df\" # Replace with the actual name of your table\n",
+"lending_test = \"test_df\" # Replace with the actual name of your table\n",
"\n",
-"# Switch to the database\n",
-"spark.sql(\"USE lending\")\n",
+"# Switch to the 'lending' database\n",
+"spark.sql(f\"CREATE DATABASE IF NOT EXISTS {lending_database}\")\n",
+"spark.sql(f\"USE {lending_database}\")\n",
"\n",
-"# Save the DataFrames as Hive tables\n",
-"train_df.write.mode(\"overwrite\").saveAsTable(hive_train_table)\n",
-"test_df.write.mode(\"overwrite\").saveAsTable(hive_test_table)"
+"# Save the DataFrame to Hive\n",
+"train_df.write.mode(\"overwrite\").saveAsTable(lending_train)\n",
+"test_df.write.mode(\"overwrite\").saveAsTable(lending_test)"
]
},
{
"cell_type": "code",
-"execution_count": null,
-"id": "998d6b05-cdf9-4e57-8386-5fc30ff06e71",
+"execution_count": 28,
+"id": "bb5a69a7-df7b-4fb7-90e0-de3e5949144f",
"metadata": {},
-"outputs": [],
-"source": []
+"outputs": [
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"+---------+\n",
+"|namespace|\n",
+"+---------+\n",
+"| default|\n",
+"| lending|\n",
+"+---------+\n",
+"\n"
+]
+}
+],
+"source": [
+"# Run a simple query\n",
+"spark.sql(\"SHOW DATABASES\").show()"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 29,
+"id": "bad2c890-9eac-493f-bb33-8c2caffff5fb",
+"metadata": {},
+"outputs": [
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"+---------+---------+-----------+\n",
+"|namespace|tableName|isTemporary|\n",
+"+---------+---------+-----------+\n",
+"| lending| test_df| false|\n",
+"| lending| train_df| false|\n",
+"+---------+---------+-----------+\n",
+"\n"
+]
+}
+],
+"source": [
+"spark.sql(\"SHOW TABLES\").show()"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 30,
+"id": "508ff7c0-6492-4b86-a3c7-0f89fc05cf36",
+"metadata": {},
+"outputs": [
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"+----------------------------+----------------------------+-------+\n",
+"|col_name |data_type |comment|\n",
+"+----------------------------+----------------------------+-------+\n",
+"|Amount Requested |double |NULL |\n",
+"|Application Date |date |NULL |\n",
+"|Loan Title |string |NULL |\n",
+"|Risk_Score |double |NULL |\n",
+"|Debt-To-Income Ratio |double |NULL |\n",
+"|Zip Code |string |NULL |\n",
+"|State |string |NULL |\n",
+"|Employment Length |string |NULL |\n",
+"|Policy Code |int |NULL |\n",
+"|Day |int |NULL |\n",
+"|Week |int |NULL |\n",
+"|Month |int |NULL |\n",
+"| | | |\n",
+"|# Detailed Table Information| | |\n",
+"|Catalog |spark_catalog | |\n",
+"|Database |lending | |\n",
+"|Table |test_df | |\n",
+"|Created Time |Mon Jan 29 02:28:44 UTC 2024| |\n",
+"|Last Access |UNKNOWN | |\n",
+"|Created By |Spark 3.5.0 | |\n",
+"+----------------------------+----------------------------+-------+\n",
+"only showing top 20 rows\n",
+"\n"
+]
+}
+],
+"source": [
+"# Replace 'tu_tabla' with the name of the table you want to explore\n",
+"tabla_seleccionada = 'test_df'\n",
+"spark.sql(f\"DESCRIBE EXTENDED {tabla_seleccionada}\").show(truncate=False)\n"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 31,
+"id": "7f9a6ba2-8ca4-4236-8edd-c06294f00e6b",
+"metadata": {},
+"outputs": [
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"+----------------------------+----------------------------+-------+\n",
+"|col_name |data_type |comment|\n",
+"+----------------------------+----------------------------+-------+\n",
+"|Amount Requested |double |NULL |\n",
+"|Application Date |date |NULL |\n",
+"|Loan Title |string |NULL |\n",
+"|Risk_Score |double |NULL |\n",
+"|Debt-To-Income Ratio |double |NULL |\n",
+"|Zip Code |string |NULL |\n",
+"|State |string |NULL |\n",
+"|Employment Length |string |NULL |\n",
+"|Policy Code |int |NULL |\n",
+"|Day |int |NULL |\n",
+"|Week |int |NULL |\n",
+"|Month |int |NULL |\n",
+"| | | |\n",
+"|# Detailed Table Information| | |\n",
+"|Catalog |spark_catalog | |\n",
+"|Database |lending | |\n",
+"|Table |train_df | |\n",
+"|Created Time |Mon Jan 29 02:26:23 UTC 2024| |\n",
+"|Last Access |UNKNOWN | |\n",
+"|Created By |Spark 3.5.0 | |\n",
+"+----------------------------+----------------------------+-------+\n",
+"only showing top 20 rows\n",
+"\n"
+]
+}
+],
+"source": [
+"# Replace 'tu_tabla' with the name of the table you want to explore\n",
+"tabla_seleccionada = 'lending.train_df'\n",
+"spark.sql(f\"DESCRIBE EXTENDED {tabla_seleccionada}\").show(truncate=False)"
+]
}
],
"metadata": {
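Once these cells have run, train_df and test_df live in the lending database of the shared metastore, so any Hive-enabled Spark session can read them back. A short usage sketch, assuming the same spark session created in the notebook above:

# Read the tables written by the notebook back out of the Hive metastore.
train_df = spark.table("lending.train_df")
test_df = spark.table("lending.test_df")
print(train_df.count(), test_df.count())

# Or query them with SQL, e.g. applications per state:
spark.sql("SELECT State, COUNT(*) AS n FROM lending.train_df GROUP BY State ORDER BY n DESC").show(5)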
