
Commit

commit hive
programmingwithclaudio committed Jan 29, 2024
1 parent 9c751c8 commit b9cea79
Showing 4 changed files with 206 additions and 108 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -6,3 +6,5 @@ parquet_converter.py
imported_data
processed_data
raw_data
+notebook/.ipynb_checkpoints
+notebook/spark-warehouse/
14 changes: 12 additions & 2 deletions docker-compose.yml
@@ -83,14 +83,24 @@ services:
ports:
- "8889:8888"
- "8000:8000"
-#volumes:
-#- /home/ozi/pri:/home/jovyan
+volumes:
+- /home/ozi/pri:/home/jovyan
# - ./charts:/home/jovyan/work
# - ./imported_data:/home/jovyan/work/imported_data
# - ./processed_data:/home/jovyan/work/processed_data
# - ./raw_data:/home/jovyan/work/raw_data
environment:
- PYSPARK_PYTHON=python3
+# Hive connection settings
+- HIVE_SERVER2_HOST=hive-server
+- HIVE_SERVER2_PORT=10000
+- HIVE_METASTORE_HOST=hive-metastore
+- HIVE_METASTORE_PORT=9083
+# JDBC configuration for Hive
+- HIVE_JDBC_URL=jdbc:hive2://hive-server:10000/default
+- HIVE_JDBC_DRIVER=org.apache.hive.jdbc.HiveDriver
+- HIVE_JDBC_USER=hive
+- HIVE_JDBC_PASSWORD=hive

volumes:
namenode:
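Note: the HIVE_* variables added above let notebook code reach HiveServer2 without hard-coding addresses. A minimal connectivity check in Python, assuming a client such as PyHive is available in the jovyan image (the commit itself does not install one):

import os
from pyhive import hive

# Read the connection settings injected through docker-compose.yml.
host = os.environ.get("HIVE_SERVER2_HOST", "hive-server")
port = int(os.environ.get("HIVE_SERVER2_PORT", "10000"))
user = os.environ.get("HIVE_JDBC_USER", "hive")

# Open a session against HiveServer2 and run a smoke-test query.
conn = hive.connect(host=host, port=port, username=user, database="default")
cursor = conn.cursor()
cursor.execute("SHOW DATABASES")
print(cursor.fetchall())
cursor.close()
conn.close()

Whether HIVE_JDBC_PASSWORD is actually needed depends on the server's authentication mode; with the default plain setup, the username alone is enough.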
2 changes: 2 additions & 0 deletions hadoop-hive.env
@@ -6,6 +6,8 @@ HIVE_SITE_CONF_datanucleus_autoCreateSchema=false
HIVE_SITE_CONF_hive_metastore_uris=thrift://hive-metastore:9083
HDFS_CONF_dfs_namenode_datanode_registration_ip___hostname___check=false

+
+
CORE_CONF_fs_defaultFS=hdfs://namenode:8020
CORE_CONF_hadoop_http_staticuser_user=root
CORE_CONF_hadoop_proxyuser_hue_hosts=*
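For context: in the bde2020-style Hadoop/Hive images this stack is built on, each CORE_CONF_*/HDFS_CONF_*/HIVE_SITE_CONF_* variable is templated into the matching XML config file at container startup, with the variable suffix mapped back to a property name. A rough sketch of that naming rule as a hypothetical Python helper (the real entrypoint does this in shell; this is an assumption about its behavior, not a copy of it):

def env_to_property(name: str, prefix: str) -> str:
    # Sketch of the convention: '___' maps to '-', remaining '_' map to '.'.
    suffix = name[len(prefix):]
    return suffix.replace("___", "-").replace("_", ".")

# HIVE_SITE_CONF_hive_metastore_uris -> hive.metastore.uris
print(env_to_property("HIVE_SITE_CONF_hive_metastore_uris", "HIVE_SITE_CONF_"))

# HDFS_CONF_dfs_namenode_datanode_registration_ip___hostname___check
#   -> dfs.namenode.datanode.registration.ip-hostname-check
print(env_to_property(
    "HDFS_CONF_dfs_namenode_datanode_registration_ip___hostname___check",
    "HDFS_CONF_",
))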
296 changes: 190 additions & 106 deletions notebook/1_ETL_lending.ipynb
@@ -10,7 +10,7 @@
},
{
"cell_type": "code",
-"execution_count": 12,
+"execution_count": 1,
"id": "275e7706-2235-4024-b488-b5737b961091",
"metadata": {},
"outputs": [],
@@ -135,104 +135,32 @@
},
{
"cell_type": "code",
-"execution_count": 8,
+"execution_count": 5,
"id": "625eb239-731a-4fee-883c-bc21b2854183",
"metadata": {},
-"outputs": [
-{
-"name": "stdout",
-"output_type": "stream",
-"text": [
-"root\n",
-" |-- Amount Requested: date (nullable = true)\n",
-" |-- Application Date: date (nullable = true)\n",
-" |-- Loan Title: string (nullable = true)\n",
-" |-- Risk_Score: float (nullable = true)\n",
-" |-- Debt-To-Income Ratio: double (nullable = true)\n",
-" |-- Zip Code: string (nullable = true)\n",
-" |-- State: string (nullable = true)\n",
-" |-- Employment Length: string (nullable = true)\n",
-" |-- Policy Code: integer (nullable = true)\n",
-" |-- Day: integer (nullable = true)\n",
-" |-- Week: integer (nullable = true)\n",
-" |-- Month: integer (nullable = true)\n",
-"\n"
-]
-}
-],
+"outputs": [],
"source": [
-"rejected_dfm.printSchema()"
+"#rejected_dfm.printSchema()"
]
},
{
"cell_type": "code",
-"execution_count": 5,
+"execution_count": 6,
"id": "178d4b9f-15a4-4dcf-85f5-a24a48218988",
"metadata": {},
-"outputs": [
-{
-"name": "stdout",
-"output_type": "stream",
-"text": [
-"+----------------+----------------+--------------------+----------+--------------------+--------+-----+-----------------+-----------+\n",
-"|Amount Requested|Application Date| Loan Title|Risk_Score|Debt-To-Income Ratio|Zip Code|State|Employment Length|Policy Code|\n",
-"+----------------+----------------+--------------------+----------+--------------------+--------+-----+-----------------+-----------+\n",
-"| 1000.0| 2007-05-26|Wedding Covered b...| 693.0| 0.1| 481| NM| 4 years| 0|\n",
-"| 1000.0| 2007-05-26| Consolidating Debt| 703.0| 0.1| 010| MA| < 1 year| 0|\n",
-"| 11000.0| 2007-05-27|Want to consolida...| 715.0| 0.1| 212| MD| 1 year| 0|\n",
-"| 6000.0| 2007-05-27| waksman| 698.0| 0.3863999938964844| 017| MA| < 1 year| 0|\n",
-"| 1500.0| 2007-05-27| mdrigo| 509.0| 0.09430000305175781| 209| MD| < 1 year| 0|\n",
-"| 15000.0| 2007-05-27| Trinfiniti| 645.0| 0.0| 105| NY| 3 years| 0|\n",
-"| 10000.0| 2007-05-27| NOTIFYi Inc| 693.0| 0.1| 210| MD| < 1 year| 0|\n",
-"| 3900.0| 2007-05-27| For Justin.| 700.0| 0.1| 469| IN| 2 years| 0|\n",
-"| 3000.0| 2007-05-28| title?| 694.0| 0.1| 808| CO| 4 years| 0|\n",
-"| 2500.0| 2007-05-28| timgerst| 573.0| 0.11760000228881835| 407| KY| 4 years| 0|\n",
-"| 3900.0| 2007-05-28| need to consolidate| 710.0| 0.1| 705| LA| 10+ years| 0|\n",
-"| 1000.0| 2007-05-28| sixstrings| 680.0| 0.1| 424| KY| 1 year| 0|\n",
-"| 3000.0| 2007-05-28| bmoore5110| 688.0| 0.1| 190| PA| < 1 year| 0|\n",
-"| 1500.0| 2007-05-28| MHarkins| 704.0| 0.1| 189| PA| 3 years| 0|\n",
-"| 1000.0| 2007-05-28| Moving| 694.0| 0.1| 354| AL| < 1 year| 0|\n",
-"| 8000.0| 2007-05-28|Recent College Gr...| 708.0| 0.1| 374| TN| < 1 year| 0|\n",
-"| 12000.0| 2007-05-29| FoundersCafe.com| 685.0| 0.1| 770| TX| 3 years| 0|\n",
-"| 1000.0| 2007-05-29| UChicago2004| 698.0| 0.1| 207| MD| 3 years| 0|\n",
-"| 15000.0| 2007-05-29|Cancer is Killing...| 680.0| 0.1| 432| OH| < 1 year| 0|\n",
-"| 5000.0| 2007-05-29|2006-2007 College...| 680.0| 0.1| 011| MA| < 1 year| 0|\n",
-"+----------------+----------------+--------------------+----------+--------------------+--------+-----+-----------------+-----------+\n",
-"only showing top 20 rows\n",
-"\n"
-]
-}
-],
+"outputs": [],
"source": [
-"rejected_dfm.show()"
+"#rejected_dfm.show()"
]
},
{
"cell_type": "code",
-"execution_count": 6,
+"execution_count": 7,
"id": "fb3e2457-6f14-4036-b0b9-446311054b62",
"metadata": {},
-"outputs": [
-{
-"name": "stdout",
-"output_type": "stream",
-"text": [
-"root\n",
-" |-- Amount Requested: double (nullable = true)\n",
-" |-- Application Date: date (nullable = true)\n",
-" |-- Loan Title: string (nullable = true)\n",
-" |-- Risk_Score: float (nullable = true)\n",
-" |-- Debt-To-Income Ratio: double (nullable = true)\n",
-" |-- Zip Code: string (nullable = true)\n",
-" |-- State: string (nullable = true)\n",
-" |-- Employment Length: string (nullable = true)\n",
-" |-- Policy Code: integer (nullable = true)\n",
-"\n"
-]
-}
-],
+"outputs": [],
"source": [
-"rejected_dfm.printSchema()"
+"#rejected_dfm.printSchema()"
]
},
{
@@ -304,7 +232,7 @@
},
{
"cell_type": "code",
-"execution_count": 13,
+"execution_count": 11,
"id": "899d291f-981f-42a1-a84c-e16385fa0dee",
"metadata": {},
"outputs": [],
@@ -340,45 +268,201 @@
},
{
"cell_type": "code",
-"execution_count": null,
-"id": "6708a5fd-1a57-445c-85ec-6ea33f1f5d6d",
-"metadata": {},
-"outputs": [],
+"execution_count": 25,
+"id": "39629363-4d27-45d1-9736-b552dc7673a7",
+"metadata": {
+"scrolled": true
+},
+"outputs": [
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"+---------+\n",
+"|namespace|\n",
+"+---------+\n",
+"| default|\n",
+"| lending|\n",
+"+---------+\n",
+"\n"
+]
+}
+],
"source": [
-"# Example of how to create and store a DataFrame in a Hive table in Parquet format\n",
-"df.write.mode(\"overwrite\").parquet(\"/user/hive/warehouse/lending.db/train_data_parquet\")\n",
-"spark.sql(\"CREATE TABLE IF NOT EXISTS lending.train_data_parquet USING PARQUET LOCATION '/user/hive/warehouse/lending.db/train_data_parquet'\")\n"
+"# Define the Hive warehouse location in HDFS\n",
+"warehouse_location = \"/user/hive/warehouse\"\n",
+"\n",
+"# Create a Spark session\n",
+"spark = SparkSession.builder \\\n",
+" .appName(\"Test Hive Connection\") \\\n",
+" .config(\"spark.sql.warehouse.dir\", warehouse_location) \\\n",
+" .enableHiveSupport() \\\n",
+" .getOrCreate()\n",
+"\n",
+"# Run a simple query\n",
+"spark.sql(\"SHOW DATABASES\").show()"
]
},
{
"cell_type": "code",
-"execution_count": 15,
-"id": "a71aec30-1dd4-41cb-8e32-36a5bb69f9b9",
+"execution_count": 27,
+"id": "030093db-a839-476f-be3c-02758ff18a72",
"metadata": {},
"outputs": [],
"source": [
-"# Hive table names\n",
-"hive_train_table = \"train_data\"\n",
-"hive_test_table = \"test_data\"\n",
+"# Define the database and table\n",
+"lending_database = \"lending\"\n",
"\n",
-"# Create the databases if they do not exist\n",
-"spark.sql(\"CREATE DATABASE IF NOT EXISTS lending\")\n",
+"lending_train = \"train_df\" # Replace with the actual name of your table\n",
+"lending_test = \"test_df\" # Replace with the actual name of your table\n",
"\n",
-"# Switch to the database\n",
-"spark.sql(\"USE lending\")\n",
+"# Switch to the 'lending' database\n",
+"spark.sql(f\"CREATE DATABASE IF NOT EXISTS {lending_database}\")\n",
+"spark.sql(f\"USE {lending_database}\")\n",
"\n",
-"# Save the DataFrames as Hive tables\n",
-"train_df.write.mode(\"overwrite\").saveAsTable(hive_train_table)\n",
-"test_df.write.mode(\"overwrite\").saveAsTable(hive_test_table)"
+"# Save the DataFrame to Hive\n",
+"train_df.write.mode(\"overwrite\").saveAsTable(lending_train)\n",
+"test_df.write.mode(\"overwrite\").saveAsTable(lending_test)"
]
},
{
"cell_type": "code",
-"execution_count": null,
-"id": "998d6b05-cdf9-4e57-8386-5fc30ff06e71",
+"execution_count": 28,
+"id": "bb5a69a7-df7b-4fb7-90e0-de3e5949144f",
"metadata": {},
-"outputs": [],
-"source": []
+"outputs": [
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"+---------+\n",
+"|namespace|\n",
+"+---------+\n",
+"| default|\n",
+"| lending|\n",
+"+---------+\n",
+"\n"
+]
+}
+],
+"source": [
+"# Run a simple query\n",
+"spark.sql(\"SHOW DATABASES\").show()"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 29,
+"id": "bad2c890-9eac-493f-bb33-8c2caffff5fb",
+"metadata": {},
+"outputs": [
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"+---------+---------+-----------+\n",
+"|namespace|tableName|isTemporary|\n",
+"+---------+---------+-----------+\n",
+"| lending| test_df| false|\n",
+"| lending| train_df| false|\n",
+"+---------+---------+-----------+\n",
+"\n"
+]
+}
+],
+"source": [
+"spark.sql(\"SHOW TABLES\").show()"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 30,
+"id": "508ff7c0-6492-4b86-a3c7-0f89fc05cf36",
+"metadata": {},
+"outputs": [
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"+----------------------------+----------------------------+-------+\n",
+"|col_name |data_type |comment|\n",
+"+----------------------------+----------------------------+-------+\n",
+"|Amount Requested |double |NULL |\n",
+"|Application Date |date |NULL |\n",
+"|Loan Title |string |NULL |\n",
+"|Risk_Score |double |NULL |\n",
+"|Debt-To-Income Ratio |double |NULL |\n",
+"|Zip Code |string |NULL |\n",
+"|State |string |NULL |\n",
+"|Employment Length |string |NULL |\n",
+"|Policy Code |int |NULL |\n",
+"|Day |int |NULL |\n",
+"|Week |int |NULL |\n",
+"|Month |int |NULL |\n",
+"| | | |\n",
+"|# Detailed Table Information| | |\n",
+"|Catalog |spark_catalog | |\n",
+"|Database |lending | |\n",
+"|Table |test_df | |\n",
+"|Created Time |Mon Jan 29 02:28:44 UTC 2024| |\n",
+"|Last Access |UNKNOWN | |\n",
+"|Created By |Spark 3.5.0 | |\n",
+"+----------------------------+----------------------------+-------+\n",
+"only showing top 20 rows\n",
+"\n"
+]
+}
+],
+"source": [
+"# Replace 'tu_tabla' with the name of the table you want to explore\n",
+"tabla_seleccionada = 'test_df'\n",
+"spark.sql(f\"DESCRIBE EXTENDED {tabla_seleccionada}\").show(truncate=False)\n"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 31,
+"id": "7f9a6ba2-8ca4-4236-8edd-c06294f00e6b",
+"metadata": {},
+"outputs": [
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"+----------------------------+----------------------------+-------+\n",
+"|col_name |data_type |comment|\n",
+"+----------------------------+----------------------------+-------+\n",
+"|Amount Requested |double |NULL |\n",
+"|Application Date |date |NULL |\n",
+"|Loan Title |string |NULL |\n",
+"|Risk_Score |double |NULL |\n",
+"|Debt-To-Income Ratio |double |NULL |\n",
+"|Zip Code |string |NULL |\n",
+"|State |string |NULL |\n",
+"|Employment Length |string |NULL |\n",
+"|Policy Code |int |NULL |\n",
+"|Day |int |NULL |\n",
+"|Week |int |NULL |\n",
+"|Month |int |NULL |\n",
+"| | | |\n",
+"|# Detailed Table Information| | |\n",
+"|Catalog |spark_catalog | |\n",
+"|Database |lending | |\n",
+"|Table |train_df | |\n",
+"|Created Time |Mon Jan 29 02:26:23 UTC 2024| |\n",
+"|Last Access |UNKNOWN | |\n",
+"|Created By |Spark 3.5.0 | |\n",
+"+----------------------------+----------------------------+-------+\n",
+"only showing top 20 rows\n",
+"\n"
+]
+}
+],
+"source": [
+"# Replace 'tu_tabla' with the name of the table you want to explore\n",
+"tabla_seleccionada = 'lending.train_df'\n",
+"spark.sql(f\"DESCRIBE EXTENDED {tabla_seleccionada}\").show(truncate=False)"
+]
}
],
"metadata": {
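Once these cells have run, train_df and test_df live in the lending database of the shared metastore, so any Hive-enabled Spark session can read them back. A short usage sketch, assuming the same spark session created in the notebook above:

# Read the tables written by the notebook back out of the Hive metastore.
train_df = spark.table("lending.train_df")
test_df = spark.table("lending.test_df")
print(train_df.count(), test_df.count())

# Or query them with SQL, e.g. applications per state:
spark.sql("SELECT State, COUNT(*) AS n FROM lending.train_df GROUP BY State ORDER BY n DESC").show(5)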
