From 8b084c8dbf40ef193deb09935c8891bee981c38d Mon Sep 17 00:00:00 2001 From: Takeshi Yamamuro Date: Mon, 26 Apr 2021 13:43:18 +0900 Subject: [PATCH] Update the TPCDS schema based on the Spark codebase --- .../databricks/spark/sql/perf/Tables.scala | 34 +- .../spark/sql/perf/tpcds/GenTPCDSData.scala | 7 +- .../spark/sql/perf/tpcds/TPCDSTables.scala | 937 ++++++++++-------- 3 files changed, 534 insertions(+), 444 deletions(-) diff --git a/src/main/scala/com/databricks/spark/sql/perf/Tables.scala b/src/main/scala/com/databricks/spark/sql/perf/Tables.scala index 177d38ce..5eba1f5f 100644 --- a/src/main/scala/com/databricks/spark/sql/perf/Tables.scala +++ b/src/main/scala/com/databricks/spark/sql/perf/Tables.scala @@ -25,6 +25,7 @@ import org.slf4j.LoggerFactory import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD +import org.apache.spark.sql.ColumnName import org.apache.spark.sql.functions._ import org.apache.spark.sql.types._ import org.apache.spark.sql.{Row, SQLContext, SaveMode} @@ -95,7 +96,8 @@ trait DataGenerator extends Serializable { abstract class Tables(sqlContext: SQLContext, scaleFactor: String, - useDoubleForDecimal: Boolean = false, useStringForDate: Boolean = false) + useDoubleForDecimal: Boolean = false, useStringForDate: Boolean = false, + useStringForCharVarchar: Boolean = true) extends Serializable { def dataGenerator: DataGenerator @@ -105,11 +107,21 @@ abstract class Tables(sqlContext: SQLContext, scaleFactor: String, def sparkContext = sqlContext.sparkContext - case class Table(name: String, partitionColumns: Seq[String], fields: StructField*) { - val schema = StructType(fields) + object Table { + + def apply(name: String, partitionColumns: Seq[String], fields: StructField*): Table = { + Table(name, partitionColumns, StructType(fields)) + } + + def apply(name: String, partitionColumns: Seq[String], schemaString: String): Table = { + Table(name, partitionColumns, StructType.fromDDL(schemaString)) + } + } + + case class Table(name: 
String, partitionColumns: Seq[String], schema: StructType) { def nonPartitioned: Table = { - Table(name, Nil, fields : _*) + Table(name, Nil, schema) } /** @@ -144,7 +156,12 @@ abstract class Tables(sqlContext: SQLContext, scaleFactor: String, val convertedData = { val columns = schema.fields.map { f => - col(f.name).cast(f.dataType).as(f.name) + val expr = f.dataType match { + // Needs right-side padding for char types + case CharType(n) => rpad(new ColumnName(f.name), n, " ") + case _ => new ColumnName(f.name).cast(f.dataType) + } + expr.as(f.name) } stringData.select(columns: _*) } @@ -156,16 +173,17 @@ abstract class Tables(sqlContext: SQLContext, scaleFactor: String, } def convertTypes(): Table = { - val newFields = fields.map { field => + val newFields = schema.fields.map { field => val newDataType = field.dataType match { case decimal: DecimalType if useDoubleForDecimal => DoubleType case date: DateType if useStringForDate => StringType + case _: CharType | _: VarcharType if useStringForCharVarchar => StringType case other => other } field.copy(dataType = newDataType) } - Table(name, partitionColumns, newFields:_*) + Table(name, partitionColumns, StructType(newFields)) } def genData( @@ -274,7 +292,7 @@ abstract class Tables(sqlContext: SQLContext, scaleFactor: String, log.info(s"Analyzing table $name.") sqlContext.sql(s"ANALYZE TABLE $databaseName.$name COMPUTE STATISTICS") if (analyzeColumns) { - val allColumns = fields.map(_.name).mkString(", ") + val allColumns = schema.fields.map(_.name).mkString(", ") println(s"Analyzing table $name columns $allColumns.") log.info(s"Analyzing table $name columns $allColumns.") sqlContext.sql(s"ANALYZE TABLE $databaseName.$name COMPUTE STATISTICS FOR COLUMNS $allColumns") diff --git a/src/main/scala/com/databricks/spark/sql/perf/tpcds/GenTPCDSData.scala b/src/main/scala/com/databricks/spark/sql/perf/tpcds/GenTPCDSData.scala index d3414844..79517d73 100644 --- 
a/src/main/scala/com/databricks/spark/sql/perf/tpcds/GenTPCDSData.scala +++ b/src/main/scala/com/databricks/spark/sql/perf/tpcds/GenTPCDSData.scala @@ -26,6 +26,7 @@ case class GenTPCDSDataConfig( format: String = null, useDoubleForDecimal: Boolean = false, useStringForDate: Boolean = false, + useStringForCharVarchar: Boolean = true, overwrite: Boolean = false, partitionTables: Boolean = true, clusterByPartitionColumns: Boolean = true, @@ -65,6 +66,9 @@ object GenTPCDSData { opt[Boolean]('e', "useStringForDate") .action((x, c) => c.copy(useStringForDate = x)) .text("true to replace DateType with StringType") + opt[Boolean]('r', "useStringForCharVarchar") + .action((x, c) => c.copy(useStringForCharVarchar = x)) + .text("true to replace CharType/VarcharType with StringType") opt[Boolean]('o', "overwrite") .action((x, c) => c.copy(overwrite = x)) .text("overwrite the data that is already there") @@ -106,7 +110,8 @@ object GenTPCDSData { dsdgenDir = config.dsdgenDir, scaleFactor = config.scaleFactor, useDoubleForDecimal = config.useDoubleForDecimal, - useStringForDate = config.useStringForDate) + useStringForDate = config.useStringForDate, + useStringForCharVarchar = config.useStringForCharVarchar) tables.genData( location = config.location, diff --git a/src/main/scala/com/databricks/spark/sql/perf/tpcds/TPCDSTables.scala b/src/main/scala/com/databricks/spark/sql/perf/tpcds/TPCDSTables.scala index 8243cd34..fd3e0cb2 100644 --- a/src/main/scala/com/databricks/spark/sql/perf/tpcds/TPCDSTables.scala +++ b/src/main/scala/com/databricks/spark/sql/perf/tpcds/TPCDSTables.scala @@ -16,10 +16,7 @@ package com.databricks.spark.sql.perf.tpcds -import scala.sys.process._ - -import com.databricks.spark.sql.perf -import com.databricks.spark.sql.perf.{BlockingLineStream, DataGenerator, Table, Tables} +import com.databricks.spark.sql.perf.{BlockingLineStream, DataGenerator, Tables} import org.apache.spark.SparkContext import org.apache.spark.sql.SQLContext @@ -59,484 +56,554 @@ class 
TPCDSTables( dsdgenDir: String, scaleFactor: String, useDoubleForDecimal: Boolean = false, - useStringForDate: Boolean = false) - extends Tables(sqlContext, scaleFactor, useDoubleForDecimal, useStringForDate) { - import sqlContext.implicits._ + useStringForDate: Boolean = false, + useStringForCharVarchar: Boolean = true) + extends Tables(sqlContext, scaleFactor, useDoubleForDecimal, useStringForDate, useStringForCharVarchar) { val dataGenerator = new DSDGEN(dsdgenDir) + + /** + * Data type mapping between TPC-DS and Spark SQL, fully matching the schemas defined in + * `tpcds.sql` of the official TPC-DS toolkit. + * See more at: + * http://www.tpc.org/tpc_documents_current_versions/pdf/tpc-ds_v2.9.0.pdf + * + * |---------------|---------------| + * | TPC-DS | Spark SQL | + * |---------------|---------------| + * | Identifier | INT | + * |---------------|---------------| + * | Integer | INT | + * |---------------|---------------| + * | Decimal(d, f) | Decimal(d, f) | + * |---------------|---------------| + * | Char(N) | Char(N) | + * |---------------|---------------| + * | Varchar(N) | Varchar(N) | + * |---------------|---------------| + * | Date | Date | + * |---------------|---------------| + */ val tables = Seq( Table("catalog_sales", partitionColumns = "cs_sold_date_sk" :: Nil, - 'cs_sold_date_sk .int, - 'cs_sold_time_sk .int, - 'cs_ship_date_sk .int, - 'cs_bill_customer_sk .int, - 'cs_bill_cdemo_sk .int, - 'cs_bill_hdemo_sk .int, - 'cs_bill_addr_sk .int, - 'cs_ship_customer_sk .int, - 'cs_ship_cdemo_sk .int, - 'cs_ship_hdemo_sk .int, - 'cs_ship_addr_sk .int, - 'cs_call_center_sk .int, - 'cs_catalog_page_sk .int, - 'cs_ship_mode_sk .int, - 'cs_warehouse_sk .int, - 'cs_item_sk .int, - 'cs_promo_sk .int, - 'cs_order_number .long, - 'cs_quantity .int, - 'cs_wholesale_cost .decimal(7,2), - 'cs_list_price .decimal(7,2), - 'cs_sales_price .decimal(7,2), - 'cs_ext_discount_amt .decimal(7,2), - 'cs_ext_sales_price .decimal(7,2), - 'cs_ext_wholesale_cost .decimal(7,2), - 
'cs_ext_list_price .decimal(7,2), - 'cs_ext_tax .decimal(7,2), - 'cs_coupon_amt .decimal(7,2), - 'cs_ext_ship_cost .decimal(7,2), - 'cs_net_paid .decimal(7,2), - 'cs_net_paid_inc_tax .decimal(7,2), - 'cs_net_paid_inc_ship .decimal(7,2), - 'cs_net_paid_inc_ship_tax .decimal(7,2), - 'cs_net_profit .decimal(7,2)), + """ + |`cs_sold_date_sk` INT, + |`cs_sold_time_sk` INT, + |`cs_ship_date_sk` INT, + |`cs_bill_customer_sk` INT, + |`cs_bill_cdemo_sk` INT, + |`cs_bill_hdemo_sk` INT, + |`cs_bill_addr_sk` INT, + |`cs_ship_customer_sk` INT, + |`cs_ship_cdemo_sk` INT, + |`cs_ship_hdemo_sk` INT, + |`cs_ship_addr_sk` INT, + |`cs_call_center_sk` INT, + |`cs_catalog_page_sk` INT, + |`cs_ship_mode_sk` INT, + |`cs_warehouse_sk` INT, + |`cs_item_sk` INT, + |`cs_promo_sk` INT, + |`cs_order_number` INT, + |`cs_quantity` INT, + |`cs_wholesale_cost` DECIMAL(7,2), + |`cs_list_price` DECIMAL(7,2), + |`cs_sales_price` DECIMAL(7,2), + |`cs_ext_discount_amt` DECIMAL(7,2), + |`cs_ext_sales_price` DECIMAL(7,2), + |`cs_ext_wholesale_cost` DECIMAL(7,2), + |`cs_ext_list_price` DECIMAL(7,2), + |`cs_ext_tax` DECIMAL(7,2), + |`cs_coupon_amt` DECIMAL(7,2), + |`cs_ext_ship_cost` DECIMAL(7,2), + |`cs_net_paid` DECIMAL(7,2), + |`cs_net_paid_inc_tax` DECIMAL(7,2), + |`cs_net_paid_inc_ship` DECIMAL(7,2), + |`cs_net_paid_inc_ship_tax` DECIMAL(7,2), + |`cs_net_profit` DECIMAL(7,2) + """.stripMargin), Table("catalog_returns", - partitionColumns = "cr_returned_date_sk" :: Nil, - 'cr_returned_date_sk .int, - 'cr_returned_time_sk .int, - 'cr_item_sk .int, - 'cr_refunded_customer_sk .int, - 'cr_refunded_cdemo_sk .int, - 'cr_refunded_hdemo_sk .int, - 'cr_refunded_addr_sk .int, - 'cr_returning_customer_sk .int, - 'cr_returning_cdemo_sk .int, - 'cr_returning_hdemo_sk .int, - 'cr_returning_addr_sk .int, - 'cr_call_center_sk .int, - 'cr_catalog_page_sk .int, - 'cr_ship_mode_sk .int, - 'cr_warehouse_sk .int, - 'cr_reason_sk .int, - 'cr_order_number .long, - 'cr_return_quantity .int, - 'cr_return_amount .decimal(7,2), 
- 'cr_return_tax .decimal(7,2), - 'cr_return_amt_inc_tax .decimal(7,2), - 'cr_fee .decimal(7,2), - 'cr_return_ship_cost .decimal(7,2), - 'cr_refunded_cash .decimal(7,2), - 'cr_reversed_charge .decimal(7,2), - 'cr_store_credit .decimal(7,2), - 'cr_net_loss .decimal(7,2)), + "cr_returned_date_sk" :: Nil, + """ + |`cr_returned_date_sk` INT, + |`cr_returned_time_sk` INT, + |`cr_item_sk` INT, + |`cr_refunded_customer_sk` INT, + |`cr_refunded_cdemo_sk` INT, + |`cr_refunded_hdemo_sk` INT, + |`cr_refunded_addr_sk` INT, + |`cr_returning_customer_sk` INT, + |`cr_returning_cdemo_sk` INT, + |`cr_returning_hdemo_sk` INT, + |`cr_returning_addr_sk` INT, + |`cr_call_center_sk` INT, + |`cr_catalog_page_sk` INT, + |`cr_ship_mode_sk` INT, + |`cr_warehouse_sk` INT, + |`cr_reason_sk` INT,`cr_order_number` INT, + |`cr_return_quantity` INT, + |`cr_return_amount` DECIMAL(7,2), + |`cr_return_tax` DECIMAL(7,2), + |`cr_return_amt_inc_tax` DECIMAL(7,2), + |`cr_fee` DECIMAL(7,2), + |`cr_return_ship_cost` DECIMAL(7,2), + |`cr_refunded_cash` DECIMAL(7,2), + |`cr_reversed_charge` DECIMAL(7,2), + |`cr_store_credit` DECIMAL(7,2), + |`cr_net_loss` DECIMAL(7,2) + """.stripMargin), Table("inventory", partitionColumns = "inv_date_sk" :: Nil, - 'inv_date_sk .int, - 'inv_item_sk .int, - 'inv_warehouse_sk .int, - 'inv_quantity_on_hand .int), + """ + |`inv_date_sk` INT, + |`inv_item_sk` INT, + |`inv_warehouse_sk` INT, + |`inv_quantity_on_hand` INT + """.stripMargin), Table("store_sales", partitionColumns = "ss_sold_date_sk" :: Nil, - 'ss_sold_date_sk .int, - 'ss_sold_time_sk .int, - 'ss_item_sk .int, - 'ss_customer_sk .int, - 'ss_cdemo_sk .int, - 'ss_hdemo_sk .int, - 'ss_addr_sk .int, - 'ss_store_sk .int, - 'ss_promo_sk .int, - 'ss_ticket_number .long, - 'ss_quantity .int, - 'ss_wholesale_cost .decimal(7,2), - 'ss_list_price .decimal(7,2), - 'ss_sales_price .decimal(7,2), - 'ss_ext_discount_amt .decimal(7,2), - 'ss_ext_sales_price .decimal(7,2), - 'ss_ext_wholesale_cost.decimal(7,2), - 'ss_ext_list_price 
.decimal(7,2), - 'ss_ext_tax .decimal(7,2), - 'ss_coupon_amt .decimal(7,2), - 'ss_net_paid .decimal(7,2), - 'ss_net_paid_inc_tax .decimal(7,2), - 'ss_net_profit .decimal(7,2)), + """ + |`ss_sold_date_sk` INT, + |`ss_sold_time_sk` INT, + |`ss_item_sk` INT, + |`ss_customer_sk` INT, + |`ss_cdemo_sk` INT, + |`ss_hdemo_sk` INT, + |`ss_addr_sk` INT, + |`ss_store_sk` INT, + |`ss_promo_sk` INT, + |`ss_ticket_number` INT, + |`ss_quantity` INT, + |`ss_wholesale_cost` DECIMAL(7,2), + |`ss_list_price` DECIMAL(7,2), + |`ss_sales_price` DECIMAL(7,2), + |`ss_ext_discount_amt` DECIMAL(7,2), + |`ss_ext_sales_price` DECIMAL(7,2), + |`ss_ext_wholesale_cost` DECIMAL(7,2), + |`ss_ext_list_price` DECIMAL(7,2), + |`ss_ext_tax` DECIMAL(7,2), + |`ss_coupon_amt` DECIMAL(7,2), + |`ss_net_paid` DECIMAL(7,2), + |`ss_net_paid_inc_tax` DECIMAL(7,2), + |`ss_net_profit` DECIMAL(7,2) + """.stripMargin), Table("store_returns", - partitionColumns = "sr_returned_date_sk" ::Nil, - 'sr_returned_date_sk .int, - 'sr_return_time_sk .int, - 'sr_item_sk .int, - 'sr_customer_sk .int, - 'sr_cdemo_sk .int, - 'sr_hdemo_sk .int, - 'sr_addr_sk .int, - 'sr_store_sk .int, - 'sr_reason_sk .int, - 'sr_ticket_number .long, - 'sr_return_quantity .int, - 'sr_return_amt .decimal(7,2), - 'sr_return_tax .decimal(7,2), - 'sr_return_amt_inc_tax.decimal(7,2), - 'sr_fee .decimal(7,2), - 'sr_return_ship_cost .decimal(7,2), - 'sr_refunded_cash .decimal(7,2), - 'sr_reversed_charge .decimal(7,2), - 'sr_store_credit .decimal(7,2), - 'sr_net_loss .decimal(7,2)), + partitionColumns = "sr_returned_date_sk" :: Nil, + """ + |`sr_returned_date_sk` INT, + |`sr_return_time_sk` INT, + |`sr_item_sk` INT, + |`sr_customer_sk` INT, + |`sr_cdemo_sk` INT, + |`sr_hdemo_sk` INT, + |`sr_addr_sk` INT, + |`sr_store_sk` INT, + |`sr_reason_sk` INT, + |`sr_ticket_number` INT, + |`sr_return_quantity` INT, + |`sr_return_amt` DECIMAL(7,2), + |`sr_return_tax` DECIMAL(7,2), + |`sr_return_amt_inc_tax` DECIMAL(7,2), + |`sr_fee` DECIMAL(7,2), + 
|`sr_return_ship_cost` DECIMAL(7,2), + |`sr_refunded_cash` DECIMAL(7,2), + |`sr_reversed_charge` DECIMAL(7,2), + |`sr_store_credit` DECIMAL(7,2), + |`sr_net_loss` DECIMAL(7,2) + """.stripMargin), Table("web_sales", partitionColumns = "ws_sold_date_sk" :: Nil, - 'ws_sold_date_sk .int, - 'ws_sold_time_sk .int, - 'ws_ship_date_sk .int, - 'ws_item_sk .int, - 'ws_bill_customer_sk .int, - 'ws_bill_cdemo_sk .int, - 'ws_bill_hdemo_sk .int, - 'ws_bill_addr_sk .int, - 'ws_ship_customer_sk .int, - 'ws_ship_cdemo_sk .int, - 'ws_ship_hdemo_sk .int, - 'ws_ship_addr_sk .int, - 'ws_web_page_sk .int, - 'ws_web_site_sk .int, - 'ws_ship_mode_sk .int, - 'ws_warehouse_sk .int, - 'ws_promo_sk .int, - 'ws_order_number .long, - 'ws_quantity .int, - 'ws_wholesale_cost .decimal(7,2), - 'ws_list_price .decimal(7,2), - 'ws_sales_price .decimal(7,2), - 'ws_ext_discount_amt .decimal(7,2), - 'ws_ext_sales_price .decimal(7,2), - 'ws_ext_wholesale_cost .decimal(7,2), - 'ws_ext_list_price .decimal(7,2), - 'ws_ext_tax .decimal(7,2), - 'ws_coupon_amt .decimal(7,2), - 'ws_ext_ship_cost .decimal(7,2), - 'ws_net_paid .decimal(7,2), - 'ws_net_paid_inc_tax .decimal(7,2), - 'ws_net_paid_inc_ship .decimal(7,2), - 'ws_net_paid_inc_ship_tax .decimal(7,2), - 'ws_net_profit .decimal(7,2)), + """ + |`ws_sold_date_sk` INT, + |`ws_sold_time_sk` INT, + |`ws_ship_date_sk` INT, + |`ws_item_sk` INT, + |`ws_bill_customer_sk` INT, + |`ws_bill_cdemo_sk` INT, + |`ws_bill_hdemo_sk` INT, + |`ws_bill_addr_sk` INT, + |`ws_ship_customer_sk` INT, + |`ws_ship_cdemo_sk` INT, + |`ws_ship_hdemo_sk` INT, + |`ws_ship_addr_sk` INT, + |`ws_web_page_sk` INT, + |`ws_web_site_sk` INT, + |`ws_ship_mode_sk` INT, + |`ws_warehouse_sk` INT, + |`ws_promo_sk` INT, + |`ws_order_number` INT, + |`ws_quantity` INT, + |`ws_wholesale_cost` DECIMAL(7,2), + |`ws_list_price` DECIMAL(7,2), + |`ws_sales_price` DECIMAL(7,2), + |`ws_ext_discount_amt` DECIMAL(7,2), + |`ws_ext_sales_price` DECIMAL(7,2), + |`ws_ext_wholesale_cost` DECIMAL(7,2), + 
|`ws_ext_list_price` DECIMAL(7,2), + |`ws_ext_tax` DECIMAL(7,2), + |`ws_coupon_amt` DECIMAL(7,2), + |`ws_ext_ship_cost` DECIMAL(7,2), + |`ws_net_paid` DECIMAL(7,2), + |`ws_net_paid_inc_tax` DECIMAL(7,2), + |`ws_net_paid_inc_ship` DECIMAL(7,2), + |`ws_net_paid_inc_ship_tax` DECIMAL(7,2), + |`ws_net_profit` DECIMAL(7,2) + """.stripMargin), Table("web_returns", - partitionColumns = "wr_returned_date_sk" ::Nil, - 'wr_returned_date_sk .int, - 'wr_returned_time_sk .int, - 'wr_item_sk .int, - 'wr_refunded_customer_sk .int, - 'wr_refunded_cdemo_sk .int, - 'wr_refunded_hdemo_sk .int, - 'wr_refunded_addr_sk .int, - 'wr_returning_customer_sk .int, - 'wr_returning_cdemo_sk .int, - 'wr_returning_hdemo_sk .int, - 'wr_returning_addr_sk .int, - 'wr_web_page_sk .int, - 'wr_reason_sk .int, - 'wr_order_number .long, - 'wr_return_quantity .int, - 'wr_return_amt .decimal(7,2), - 'wr_return_tax .decimal(7,2), - 'wr_return_amt_inc_tax .decimal(7,2), - 'wr_fee .decimal(7,2), - 'wr_return_ship_cost .decimal(7,2), - 'wr_refunded_cash .decimal(7,2), - 'wr_reversed_charge .decimal(7,2), - 'wr_account_credit .decimal(7,2), - 'wr_net_loss .decimal(7,2)), + partitionColumns = "wr_returned_date_sk" :: Nil, + """ + |`wr_returned_date_sk` INT, + |`wr_returned_time_sk` INT, + |`wr_item_sk` INT, + |`wr_refunded_customer_sk` INT, + |`wr_refunded_cdemo_sk` INT, + |`wr_refunded_hdemo_sk` INT, + |`wr_refunded_addr_sk` INT, + |`wr_returning_customer_sk` INT, + |`wr_returning_cdemo_sk` INT, + |`wr_returning_hdemo_sk` INT, + |`wr_returning_addr_sk` INT, + |`wr_web_page_sk` INT, + |`wr_reason_sk` INT, + |`wr_order_number` INT, + |`wr_return_quantity` INT, + |`wr_return_amt` DECIMAL(7,2), + |`wr_return_tax` DECIMAL(7,2), + |`wr_return_amt_inc_tax` DECIMAL(7,2), + |`wr_fee` DECIMAL(7,2), + |`wr_return_ship_cost` DECIMAL(7,2), + |`wr_refunded_cash` DECIMAL(7,2), + |`wr_reversed_charge` DECIMAL(7,2), + |`wr_account_credit` DECIMAL(7,2), + |`wr_net_loss` DECIMAL(7,2) + """.stripMargin), Table("call_center", 
partitionColumns = Nil, - 'cc_call_center_sk .int, - 'cc_call_center_id .string, - 'cc_rec_start_date .date, - 'cc_rec_end_date .date, - 'cc_closed_date_sk .int, - 'cc_open_date_sk .int, - 'cc_name .string, - 'cc_class .string, - 'cc_employees .int, - 'cc_sq_ft .int, - 'cc_hours .string, - 'cc_manager .string, - 'cc_mkt_id .int, - 'cc_mkt_class .string, - 'cc_mkt_desc .string, - 'cc_market_manager .string, - 'cc_division .int, - 'cc_division_name .string, - 'cc_company .int, - 'cc_company_name .string, - 'cc_street_number .string, - 'cc_street_name .string, - 'cc_street_type .string, - 'cc_suite_number .string, - 'cc_city .string, - 'cc_county .string, - 'cc_state .string, - 'cc_zip .string, - 'cc_country .string, - 'cc_gmt_offset .decimal(5,2), - 'cc_tax_percentage .decimal(5,2)), + """ + |`cc_call_center_sk` INT, + |`cc_call_center_id` CHAR(16), + |`cc_rec_start_date` DATE, + |`cc_rec_end_date` DATE, + |`cc_closed_date_sk` INT, + |`cc_open_date_sk` INT, + |`cc_name` VARCHAR(50), + |`cc_class` VARCHAR(50), + |`cc_employees` INT, + |`cc_sq_ft` INT, + |`cc_hours` CHAR(20), + |`cc_manager` VARCHAR(40), + |`cc_mkt_id` INT, + |`cc_mkt_class` CHAR(50), + |`cc_mkt_desc` VARCHAR(100), + |`cc_market_manager` VARCHAR(40), + |`cc_division` INT, + |`cc_division_name` VARCHAR(50), + |`cc_company` INT, + |`cc_company_name` CHAR(50), + |`cc_street_number` CHAR(10), + |`cc_street_name` VARCHAR(60), + |`cc_street_type` CHAR(15), + |`cc_suite_number` CHAR(10), + |`cc_city` VARCHAR(60), + |`cc_county` VARCHAR(30), + |`cc_state` CHAR(2), + |`cc_zip` CHAR(10), + |`cc_country` VARCHAR(20), + |`cc_gmt_offset` DECIMAL(5,2), + |`cc_tax_percentage` DECIMAL(5,2) + """.stripMargin), Table("catalog_page", partitionColumns = Nil, - 'cp_catalog_page_sk .int, - 'cp_catalog_page_id .string, - 'cp_start_date_sk .int, - 'cp_end_date_sk .int, - 'cp_department .string, - 'cp_catalog_number .int, - 'cp_catalog_page_number .int, - 'cp_description .string, - 'cp_type .string), + """ + 
|`cp_catalog_page_sk` INT, + |`cp_catalog_page_id` CHAR(16), + |`cp_start_date_sk` INT, + |`cp_end_date_sk` INT, + |`cp_department` VARCHAR(50), + |`cp_catalog_number` INT, + |`cp_catalog_page_number` INT, + |`cp_description` VARCHAR(100), + |`cp_type` VARCHAR(100) + """.stripMargin), Table("customer", partitionColumns = Nil, - 'c_customer_sk .int, - 'c_customer_id .string, - 'c_current_cdemo_sk .int, - 'c_current_hdemo_sk .int, - 'c_current_addr_sk .int, - 'c_first_shipto_date_sk .int, - 'c_first_sales_date_sk .int, - 'c_salutation .string, - 'c_first_name .string, - 'c_last_name .string, - 'c_preferred_cust_flag .string, - 'c_birth_day .int, - 'c_birth_month .int, - 'c_birth_year .int, - 'c_birth_country .string, - 'c_login .string, - 'c_email_address .string, - 'c_last_review_date .string), + """ + |`c_customer_sk` INT, + |`c_customer_id` CHAR(16), + |`c_current_cdemo_sk` INT, + |`c_current_hdemo_sk` INT, + |`c_current_addr_sk` INT, + |`c_first_shipto_date_sk` INT, + |`c_first_sales_date_sk` INT, + |`c_salutation` CHAR(10), + |`c_first_name` CHAR(20), + |`c_last_name` CHAR(30), + |`c_preferred_cust_flag` CHAR(1), + |`c_birth_day` INT, + |`c_birth_month` INT, + |`c_birth_year` INT, + |`c_birth_country` VARCHAR(20), + |`c_login` CHAR(13), + |`c_email_address` CHAR(50), + |`c_last_review_date` INT + """.stripMargin), Table("customer_address", partitionColumns = Nil, - 'ca_address_sk .int, - 'ca_address_id .string, - 'ca_street_number .string, - 'ca_street_name .string, - 'ca_street_type .string, - 'ca_suite_number .string, - 'ca_city .string, - 'ca_county .string, - 'ca_state .string, - 'ca_zip .string, - 'ca_country .string, - 'ca_gmt_offset .decimal(5,2), - 'ca_location_type .string), + """ + |`ca_address_sk` INT, + |`ca_address_id` CHAR(16), + |`ca_street_number` CHAR(10), + |`ca_street_name` VARCHAR(60), + |`ca_street_type` CHAR(15), + |`ca_suite_number` CHAR(10), + |`ca_city` VARCHAR(60), + |`ca_county` VARCHAR(30), + |`ca_state` CHAR(2), + |`ca_zip` CHAR(10), 
+ |`ca_country` VARCHAR(20), + |`ca_gmt_offset` DECIMAL(5,2), + |`ca_location_type` CHAR(20) + """.stripMargin), Table("customer_demographics", partitionColumns = Nil, - 'cd_demo_sk .int, - 'cd_gender .string, - 'cd_marital_status .string, - 'cd_education_status .string, - 'cd_purchase_estimate .int, - 'cd_credit_rating .string, - 'cd_dep_count .int, - 'cd_dep_employed_count .int, - 'cd_dep_college_count .int), + """ + |`cd_demo_sk` INT, + |`cd_gender` CHAR(1), + |`cd_marital_status` CHAR(1), + |`cd_education_status` CHAR(20), + |`cd_purchase_estimate` INT, + |`cd_credit_rating` CHAR(10), + |`cd_dep_count` INT, + |`cd_dep_employed_count` INT, + |`cd_dep_college_count` INT + """.stripMargin), Table("date_dim", partitionColumns = Nil, - 'd_date_sk .int, - 'd_date_id .string, - 'd_date .date, - 'd_month_seq .int, - 'd_week_seq .int, - 'd_quarter_seq .int, - 'd_year .int, - 'd_dow .int, - 'd_moy .int, - 'd_dom .int, - 'd_qoy .int, - 'd_fy_year .int, - 'd_fy_quarter_seq .int, - 'd_fy_week_seq .int, - 'd_day_name .string, - 'd_quarter_name .string, - 'd_holiday .string, - 'd_weekend .string, - 'd_following_holiday .string, - 'd_first_dom .int, - 'd_last_dom .int, - 'd_same_day_ly .int, - 'd_same_day_lq .int, - 'd_current_day .string, - 'd_current_week .string, - 'd_current_month .string, - 'd_current_quarter .string, - 'd_current_year .string), + """ + |`d_date_sk` INT, + |`d_date_id` CHAR(16), + |`d_date` DATE, + |`d_month_seq` INT, + |`d_week_seq` INT, + |`d_quarter_seq` INT, + |`d_year` INT, + |`d_dow` INT, + |`d_moy` INT, + |`d_dom` INT, + |`d_qoy` INT, + |`d_fy_year` INT, + |`d_fy_quarter_seq` INT, + |`d_fy_week_seq` INT, + |`d_day_name` CHAR(9), + |`d_quarter_name` CHAR(6), + |`d_holiday` CHAR(1), + |`d_weekend` CHAR(1), + |`d_following_holiday` CHAR(1), + |`d_first_dom` INT, + |`d_last_dom` INT, + |`d_same_day_ly` INT, + |`d_same_day_lq` INT, + |`d_current_day` CHAR(1), + |`d_current_week` CHAR(1), + |`d_current_month` CHAR(1), + |`d_current_quarter` CHAR(1), + 
|`d_current_year` CHAR(1) + """.stripMargin), Table("household_demographics", partitionColumns = Nil, - 'hd_demo_sk .int, - 'hd_income_band_sk .int, - 'hd_buy_potential .string, - 'hd_dep_count .int, - 'hd_vehicle_count .int), + """ + |`hd_demo_sk` INT, + |`hd_income_band_sk` INT, + |`hd_buy_potential` CHAR(15), + |`hd_dep_count` INT, + |`hd_vehicle_count` INT + """.stripMargin), Table("income_band", partitionColumns = Nil, - 'ib_income_band_sk .int, - 'ib_lower_bound .int, - 'ib_upper_bound .int), + """ + |`ib_income_band_sk` INT, + |`ib_lower_bound` INT, + |`ib_upper_bound` INT + """.stripMargin), Table("item", partitionColumns = Nil, - 'i_item_sk .int, - 'i_item_id .string, - 'i_rec_start_date .date, - 'i_rec_end_date .date, - 'i_item_desc .string, - 'i_current_price .decimal(7,2), - 'i_wholesale_cost .decimal(7,2), - 'i_brand_id .int, - 'i_brand .string, - 'i_class_id .int, - 'i_class .string, - 'i_category_id .int, - 'i_category .string, - 'i_manufact_id .int, - 'i_manufact .string, - 'i_size .string, - 'i_formulation .string, - 'i_color .string, - 'i_units .string, - 'i_container .string, - 'i_manager_id .int, - 'i_product_name .string), + """ + |`i_item_sk` INT, + |`i_item_id` CHAR(16), + |`i_rec_start_date` DATE, + |`i_rec_end_date` DATE, + |`i_item_desc` VARCHAR(200), + |`i_current_price` DECIMAL(7,2), + |`i_wholesale_cost` DECIMAL(7,2), + |`i_brand_id` INT, + |`i_brand` CHAR(50), + |`i_class_id` INT, + |`i_class` CHAR(50), + |`i_category_id` INT, + |`i_category` CHAR(50), + |`i_manufact_id` INT, + |`i_manufact` CHAR(50), + |`i_size` CHAR(20), + |`i_formulation` CHAR(20), + |`i_color` CHAR(20), + |`i_units` CHAR(10), + |`i_container` CHAR(10), + |`i_manager_id` INT, + |`i_product_name` CHAR(50) + """.stripMargin), Table("promotion", partitionColumns = Nil, - 'p_promo_sk .int, - 'p_promo_id .string, - 'p_start_date_sk .int, - 'p_end_date_sk .int, - 'p_item_sk .int, - 'p_cost .decimal(15,2), - 'p_response_target .int, - 'p_promo_name .string, - 
'p_channel_dmail .string, - 'p_channel_email .string, - 'p_channel_catalog .string, - 'p_channel_tv .string, - 'p_channel_radio .string, - 'p_channel_press .string, - 'p_channel_event .string, - 'p_channel_demo .string, - 'p_channel_details .string, - 'p_purpose .string, - 'p_discount_active .string), + """ + |`p_promo_sk` INT, + |`p_promo_id` CHAR(16), + |`p_start_date_sk` INT, + |`p_end_date_sk` INT, + |`p_item_sk` INT, + |`p_cost` DECIMAL(15,2), + |`p_response_target` INT, + |`p_promo_name` CHAR(50), + |`p_channel_dmail` CHAR(1), + |`p_channel_email` CHAR(1), + |`p_channel_catalog` CHAR(1), + |`p_channel_tv` CHAR(1), + |`p_channel_radio` CHAR(1), + |`p_channel_press` CHAR(1), + |`p_channel_event` CHAR(1), + |`p_channel_demo` CHAR(1), + |`p_channel_details` VARCHAR(100), + |`p_purpose` CHAR(15), + |`p_discount_active` CHAR(1) + """.stripMargin), Table("reason", partitionColumns = Nil, - 'r_reason_sk .int, - 'r_reason_id .string, - 'r_reason_desc .string), + """ + |`r_reason_sk` INT, + |`r_reason_id` CHAR(16), + |`r_reason_desc` CHAR(100) + """.stripMargin), Table("ship_mode", partitionColumns = Nil, - 'sm_ship_mode_sk .int, - 'sm_ship_mode_id .string, - 'sm_type .string, - 'sm_code .string, - 'sm_carrier .string, - 'sm_contract .string), + """ + |`sm_ship_mode_sk` INT, + |`sm_ship_mode_id` CHAR(16), + |`sm_type` CHAR(30), + |`sm_code` CHAR(10), + |`sm_carrier` CHAR(20), + |`sm_contract` CHAR(20) + """.stripMargin), Table("store", partitionColumns = Nil, - 's_store_sk .int, - 's_store_id .string, - 's_rec_start_date .date, - 's_rec_end_date .date, - 's_closed_date_sk .int, - 's_store_name .string, - 's_number_employees .int, - 's_floor_space .int, - 's_hours .string, - 's_manager .string, - 's_market_id .int, - 's_geography_class .string, - 's_market_desc .string, - 's_market_manager .string, - 's_division_id .int, - 's_division_name .string, - 's_company_id .int, - 's_company_name .string, - 's_street_number .string, - 's_street_name .string, - 's_street_type 
.string, - 's_suite_number .string, - 's_city .string, - 's_county .string, - 's_state .string, - 's_zip .string, - 's_country .string, - 's_gmt_offset .decimal(5,2), - 's_tax_precentage .decimal(5,2)), + """ + |`s_store_sk` INT, + |`s_store_id` CHAR(16), + |`s_rec_start_date` DATE, + |`s_rec_end_date` DATE, + |`s_closed_date_sk` INT, + |`s_store_name` VARCHAR(50), + |`s_number_employees` INT, + |`s_floor_space` INT, + |`s_hours` CHAR(20), + |`s_manager` VARCHAR(40), + |`s_market_id` INT, + |`s_geography_class` VARCHAR(100), + |`s_market_desc` VARCHAR(100), + |`s_market_manager` VARCHAR(40), + |`s_division_id` INT, + |`s_division_name` VARCHAR(50), + |`s_company_id` INT, + |`s_company_name` VARCHAR(50), + |`s_street_number` VARCHAR(10), + |`s_street_name` VARCHAR(60), + |`s_street_type` CHAR(15), + |`s_suite_number` CHAR(10), + |`s_city` VARCHAR(60), + |`s_county` VARCHAR(30), + |`s_state` CHAR(2), + |`s_zip` CHAR(10), + |`s_country` VARCHAR(20), + |`s_gmt_offset` DECIMAL(5,2), + |`s_tax_percentage` DECIMAL(5,2) + """.stripMargin), Table("time_dim", partitionColumns = Nil, - 't_time_sk .int, - 't_time_id .string, - 't_time .int, - 't_hour .int, - 't_minute .int, - 't_second .int, - 't_am_pm .string, - 't_shift .string, - 't_sub_shift .string, - 't_meal_time .string), + """ + |`t_time_sk` INT, + |`t_time_id` CHAR(16), + |`t_time` INT, + |`t_hour` INT, + |`t_minute` INT, + |`t_second` INT, + |`t_am_pm` CHAR(2), + |`t_shift` CHAR(20), + |`t_sub_shift` CHAR(20), + |`t_meal_time` CHAR(20) + """.stripMargin), Table("warehouse", partitionColumns = Nil, - 'w_warehouse_sk .int, - 'w_warehouse_id .string, - 'w_warehouse_name .string, - 'w_warehouse_sq_ft .int, - 'w_street_number .string, - 'w_street_name .string, - 'w_street_type .string, - 'w_suite_number .string, - 'w_city .string, - 'w_county .string, - 'w_state .string, - 'w_zip .string, - 'w_country .string, - 'w_gmt_offset .decimal(5,2)), + """ + |`w_warehouse_sk` INT, + |`w_warehouse_id` CHAR(16), + 
|`w_warehouse_name` VARCHAR(20), + |`w_warehouse_sq_ft` INT, + |`w_street_number` CHAR(10), + |`w_street_name` VARCHAR(20), + |`w_street_type` CHAR(15), + |`w_suite_number` CHAR(10), + |`w_city` VARCHAR(60), + |`w_county` VARCHAR(30), + |`w_state` CHAR(2), + |`w_zip` CHAR(10), + |`w_country` VARCHAR(20), + |`w_gmt_offset` DECIMAL(5,2) + """.stripMargin), Table("web_page", partitionColumns = Nil, - 'wp_web_page_sk .int, - 'wp_web_page_id .string, - 'wp_rec_start_date .date, - 'wp_rec_end_date .date, - 'wp_creation_date_sk .int, - 'wp_access_date_sk .int, - 'wp_autogen_flag .string, - 'wp_customer_sk .int, - 'wp_url .string, - 'wp_type .string, - 'wp_char_count .int, - 'wp_link_count .int, - 'wp_image_count .int, - 'wp_max_ad_count .int), + """ + |`wp_web_page_sk` INT, + |`wp_web_page_id` CHAR(16), + |`wp_rec_start_date` DATE, + |`wp_rec_end_date` DATE, + |`wp_creation_date_sk` INT, + |`wp_access_date_sk` INT, + |`wp_autogen_flag` CHAR(1), + |`wp_customer_sk` INT, + |`wp_url` VARCHAR(100), + |`wp_type` CHAR(50), + |`wp_char_count` INT, + |`wp_link_count` INT, + |`wp_image_count` INT, + |`wp_max_ad_count` INT + """.stripMargin), Table("web_site", partitionColumns = Nil, - 'web_site_sk .int, - 'web_site_id .string, - 'web_rec_start_date .date, - 'web_rec_end_date .date, - 'web_name .string, - 'web_open_date_sk .int, - 'web_close_date_sk .int, - 'web_class .string, - 'web_manager .string, - 'web_mkt_id .int, - 'web_mkt_class .string, - 'web_mkt_desc .string, - 'web_market_manager .string, - 'web_company_id .int, - 'web_company_name .string, - 'web_street_number .string, - 'web_street_name .string, - 'web_street_type .string, - 'web_suite_number .string, - 'web_city .string, - 'web_county .string, - 'web_state .string, - 'web_zip .string, - 'web_country .string, - 'web_gmt_offset .decimal(5,2), - 'web_tax_percentage .decimal(5,2)) + """ + |`web_site_sk` INT, + |`web_site_id` CHAR(16), + |`web_rec_start_date` DATE, + |`web_rec_end_date` DATE, + |`web_name` VARCHAR(50), + 
|`web_open_date_sk` INT, + |`web_close_date_sk` INT, + |`web_class` VARCHAR(50), + |`web_manager` VARCHAR(40), + |`web_mkt_id` INT, + |`web_mkt_class` VARCHAR(50), + |`web_mkt_desc` VARCHAR(100), + |`web_market_manager` VARCHAR(40), + |`web_company_id` INT, + |`web_company_name` CHAR(50), + |`web_street_number` CHAR(10), + |`web_street_name` VARCHAR(60), + |`web_street_type` CHAR(15), + |`web_suite_number` CHAR(10), + |`web_city` VARCHAR(60), + |`web_county` VARCHAR(30), + |`web_state` CHAR(2), + |`web_zip` CHAR(10), + |`web_country` VARCHAR(20), + |`web_gmt_offset` DECIMAL(5,2), + |`web_tax_percentage` DECIMAL(5,2) + """.stripMargin) ).map(_.convertTypes()) }