diff --git a/modules/glue-catalog-database/README.md b/modules/glue-catalog-database/README.md new file mode 100644 index 0000000..03a0f24 --- /dev/null +++ b/modules/glue-catalog-database/README.md @@ -0,0 +1,37 @@ +# glue-catalog-database + +Terraform module to provision AWS Glue Catalog Databases. + +## Usage + +```hcl +module "s3_bucket_source" { + source = "cloudposse/s3-bucket/aws" + version = "2.0.3" + + acl = "private" + versioning_enabled = false + force_destroy = true + allow_encrypted_uploads_only = true + allow_ssl_requests_only = true + block_public_acls = true + block_public_policy = true + ignore_public_acls = true + restrict_public_buckets = true + + attributes = ["source"] + context = module.this.context +} + +module "glue_catalog_database" { + source = "cloudposse/glue/aws//modules/glue-catalog-database" + # Cloud Posse recommends pinning every module to a specific version + # version = "x.x.x" + + catalog_database_name = "analytics" + catalog_database_description = "Glue Catalog database using data located in an S3 bucket" + location_uri = format("s3://%s", module.s3_bucket_source.bucket_id) + + context = module.this.context +} +``` diff --git a/modules/glue-catalog-table/README.md b/modules/glue-catalog-table/README.md new file mode 100644 index 0000000..aca6863 --- /dev/null +++ b/modules/glue-catalog-table/README.md @@ -0,0 +1,95 @@ +# glue-catalog-table + +Terraform module to provision AWS Glue Catalog Tables. 
+ +## Usage + +```hcl +module "s3_bucket_source" { + source = "cloudposse/s3-bucket/aws" + version = "2.0.3" + + acl = "private" + versioning_enabled = false + force_destroy = true + allow_encrypted_uploads_only = true + allow_ssl_requests_only = true + block_public_acls = true + block_public_policy = true + ignore_public_acls = true + restrict_public_buckets = true + + attributes = ["source"] + context = module.this.context +} + +module "glue_catalog_database" { + source = "cloudposse/glue/aws//modules/glue-catalog-database" + # Cloud Posse recommends pinning every module to a specific version + # version = "x.x.x" + + catalog_database_name = "analytics" + catalog_database_description = "Glue Catalog database using data located in an S3 bucket" + location_uri = format("s3://%s", module.s3_bucket_source.bucket_id) + + context = module.this.context +} + +module "glue_catalog_table" { + source = "cloudposse/glue/aws//modules/glue-catalog-table" + # Cloud Posse recommends pinning every module to a specific version + # version = "x.x.x" + + catalog_table_name = "geo" + catalog_table_description = "region/state/county Glue Catalog table" + database_name = module.glue_catalog_database.name + + parameters = { + "lakeformation.aso.status" = true + "classification" = "parquet" + } + + storage_descriptor = { + # List of reducer grouping columns, clustering columns, and bucketing columns in the table + bucket_columns = null + # Configuration block for columns in the table + columns = [ + { + name = "county", + type = "string" + }, + { + name = "state", + type = "string" + }, + { + name = "region", + type = "string" + } + ] + # Whether the data in the table is compressed + compressed = false + # Input format: SequenceFileInputFormat (binary), or TextInputFormat, or a custom format + input_format = "org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat" + # Physical location of the table. 
By default this takes the form of the warehouse location, followed by the database location in the warehouse, followed by the table name + location = format("s3://%s/geo", module.s3_bucket_source.bucket_id) + # Must be specified if the table contains any dimension columns + number_of_buckets = 0 + # Output format: SequenceFileOutputFormat (binary), or IgnoreKeyTextOutputFormat, or a custom format + output_format = "org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat" + # Configuration block for serialization and deserialization ("SerDe") information + ser_de_info = { + # Map of initialization parameters for the SerDe, in key-value form + parameters = { + "serialization.format" = "1" + } + # Usually the class that implements the SerDe. An example is org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + serialization_library = "org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe" + } + # Whether the table data is stored in subdirectories + stored_as_sub_directories = false + } + + context = module.this.context +} +``` diff --git a/modules/glue-connection/README.md b/modules/glue-connection/README.md new file mode 100644 index 0000000..397ef07 --- /dev/null +++ b/modules/glue-connection/README.md @@ -0,0 +1,63 @@ +# glue-connection + +Terraform module to provision AWS Glue Connections. 
+ +## Usage + +```hcl +module "vpc" { + source = "cloudposse/vpc/aws" + version = "1.1.0" + + ipv4_primary_cidr_block = "172.19.0.0/16" + + context = module.this.context +} + +data "aws_subnet" "selected" { + id = module.vpc.private_subnet_ids[0] +} + +module "security_group" { + source = "cloudposse/security-group/aws" + version = "1.0.1" + + vpc_id = module.vpc.vpc_id + create_before_destroy = true + allow_all_egress = true + + rules = [ + { + type = "ingress" + from_port = 5432 + to_port = 5432 + protocol = "all" + cidr_blocks = [module.vpc.vpc_cidr_block] + } + ] + + context = module.this.context +} + +module "glue_connection" { + source = "cloudposse/glue/aws//modules/glue-connection" + # Cloud Posse recommends pinning every module to a specific version + # version = "x.x.x" + + connection_name = "geo" + connection_description = "Glue connection to Postgres database" + connection_type = "JDBC" + connection_properties = {} + + physical_connection_requirements = { + # List of security group IDs used by the connection + security_group_id_list = [module.security_group.id] + # The availability zone of the connection. This field is redundant and implied by subnet_id, but is currently an API requirement + availability_zone = data.aws_subnet.selected.availability_zone + # The subnet ID used by the connection + subnet_id = module.vpc.private_subnet_ids[0] + } + + context = module.this.context +} +``` diff --git a/modules/glue-crawler/README.md b/modules/glue-crawler/README.md new file mode 100644 index 0000000..b3eb297 --- /dev/null +++ b/modules/glue-crawler/README.md @@ -0,0 +1,112 @@ +# glue-crawler + +Terraform module to provision AWS Glue Crawlers. 
+ +## Usage + +```hcl +module "s3_bucket_source" { + source = "cloudposse/s3-bucket/aws" + version = "2.0.3" + + acl = "private" + versioning_enabled = false + force_destroy = true + allow_encrypted_uploads_only = true + allow_ssl_requests_only = true + block_public_acls = true + block_public_policy = true + ignore_public_acls = true + restrict_public_buckets = true + + attributes = ["source"] + context = module.this.context +} + +module "iam_role" { + source = "cloudposse/iam-role/aws" + version = "0.16.2" + + principals = { + "Service" = ["glue.amazonaws.com"] + } + + managed_policy_arns = [ + "arn:aws:iam::aws:policy/service-role/AWSGlueServiceRole" + ] + + policy_document_count = 0 + policy_description = "Policy for AWS Glue with access to EC2, S3, and Cloudwatch Logs" + role_description = "Role for AWS Glue with access to EC2, S3, and Cloudwatch Logs" + + context = module.this.context +} + +module "s3_bucket_destination" { + source = "cloudposse/s3-bucket/aws" + version = "2.0.3" + + acl = "private" + versioning_enabled = false + force_destroy = true + allow_encrypted_uploads_only = true + allow_ssl_requests_only = true + block_public_acls = true + block_public_policy = true + ignore_public_acls = true + restrict_public_buckets = true + + attributes = ["destination"] + context = module.this.context +} + +module "glue_catalog_database" { + source = "cloudposse/glue/aws//modules/glue-catalog-database" + # Cloud Posse recommends pinning every module to a specific version + # version = "x.x.x" + + catalog_database_name = "analytics" + catalog_database_description = "Glue Catalog database using data located in an S3 bucket" + location_uri = format("s3://%s", module.s3_bucket_source.bucket_id) + + context = module.this.context +} + +module "glue_crawler" { + source = "cloudposse/glue/aws//modules/glue-crawler" + # Cloud Posse recommends pinning every module to a specific version + # version = "x.x.x" + + crawler_description = "Glue crawler that processes data in the 
source S3 bucket and writes the result into the destination S3 bucket" + database_name = module.glue_catalog_database.name + role = module.iam_role.arn + schedule = "cron(0 1 * * ? *)" + + schema_change_policy = { + delete_behavior = "LOG" + update_behavior = null + } + + s3_target = [ + { + path = format("s3://%s", module.s3_bucket_destination.bucket_id) + } + ] + + configuration = jsonencode( + { + Grouping = { + TableGroupingPolicy = "CombineCompatibleSchemas" + } + CrawlerOutput = { + Partitions = { + AddOrUpdateBehavior = "InheritFromTable" + } + } + Version = 1 + } + ) + + context = module.this.context +} +``` diff --git a/modules/glue-job/README.md b/modules/glue-job/README.md new file mode 100644 index 0000000..7189cac --- /dev/null +++ b/modules/glue-job/README.md @@ -0,0 +1,70 @@ +# glue-job + +Terraform module to provision AWS Glue Jobs. + +## Usage + +```hcl +module "iam_role" { + source = "cloudposse/iam-role/aws" + version = "0.16.2" + + principals = { + "Service" = ["glue.amazonaws.com"] + } + + managed_policy_arns = [ + "arn:aws:iam::aws:policy/service-role/AWSGlueServiceRole" + ] + + policy_document_count = 0 + policy_description = "Policy for AWS Glue with access to EC2, S3, and Cloudwatch Logs" + role_description = "Role for AWS Glue with access to EC2, S3, and Cloudwatch Logs" + + context = module.this.context +} + +module "s3_bucket_job_source" { + source = "cloudposse/s3-bucket/aws" + version = "2.0.3" + + acl = "private" + versioning_enabled = false + force_destroy = true + allow_encrypted_uploads_only = true + allow_ssl_requests_only = true + block_public_acls = true + block_public_policy = true + ignore_public_acls = true + restrict_public_buckets = true + + context = module.this.context +} + +module "glue_job" { + source = "cloudposse/glue/aws//modules/glue-job" + # Cloud Posse recommends pinning every module to a specific version + # version = "x.x.x" + + job_name = "geo_processor" + job_description = "Glue Job for processing geo data" + 
role_arn = module.iam_role.arn + glue_version = "2.0" + default_arguments = {} + + worker_type = "Standard" + number_of_workers = 2 + max_retries = 2 + + # The job timeout in minutes + timeout = 20 + + command = { + name = "Run Python script" + script_location = format("s3://%s/geo.py", module.s3_bucket_job_source.bucket_id) + python_version = 3 + } + + context = module.this.context +} +``` diff --git a/modules/glue-registry/README.md b/modules/glue-registry/README.md new file mode 100644 index 0000000..5b07569 --- /dev/null +++ b/modules/glue-registry/README.md @@ -0,0 +1,16 @@ +# glue-registry + +Terraform module to provision AWS Glue Registries. + +## Usage + +```hcl +module "glue_registry" { + source = "cloudposse/glue/aws//modules/glue-registry" + # Cloud Posse recommends pinning every module to a specific version + # version = "x.x.x" + + registry_name = "analytics" + registry_description = "Glue Registry for analytics" +} +``` diff --git a/modules/glue-schema/README.md b/modules/glue-schema/README.md new file mode 100644 index 0000000..4a11797 --- /dev/null +++ b/modules/glue-schema/README.md @@ -0,0 +1,29 @@ +# glue-schema + +Terraform module to provision AWS Glue Schemas. 
+ +## Usage + +```hcl +module "glue_registry" { + source = "cloudposse/glue/aws//modules/glue-registry" + # Cloud Posse recommends pinning every module to a specific version + # version = "x.x.x" + + registry_name = "analytics" + registry_description = "Glue Registry for analytics" +} + +module "glue_schema" { + source = "cloudposse/glue/aws//modules/glue-schema" + # Cloud Posse recommends pinning every module to a specific version + # version = "x.x.x" + + schema_name = "geo" + schema_description = "Glue Schema for geo data" + registry_arn = module.glue_registry.arn + compatibility = "NONE" + data_format = "JSON" + schema_definition = "{\"type\": \"record\", \"name\": \"geo\", \"fields\": [ {\"name\": \"state\", \"type\": \"string\"}, {\"name\": \"city\", \"type\": \"string\"} ]}" +} +``` diff --git a/modules/glue-trigger/README.md b/modules/glue-trigger/README.md new file mode 100644 index 0000000..b9bee09 --- /dev/null +++ b/modules/glue-trigger/README.md @@ -0,0 +1,104 @@ +# glue-trigger + +Terraform module to provision AWS Glue Triggers. 
+ +## Usage + +```hcl +module "iam_role" { + source = "cloudposse/iam-role/aws" + version = "0.16.2" + + principals = { + "Service" = ["glue.amazonaws.com"] + } + + managed_policy_arns = [ + "arn:aws:iam::aws:policy/service-role/AWSGlueServiceRole" + ] + + policy_document_count = 0 + policy_description = "Policy for AWS Glue with access to EC2, S3, and Cloudwatch Logs" + role_description = "Role for AWS Glue with access to EC2, S3, and Cloudwatch Logs" + + context = module.this.context +} + +module "s3_bucket_job_source" { + source = "cloudposse/s3-bucket/aws" + version = "2.0.3" + + acl = "private" + versioning_enabled = false + force_destroy = true + allow_encrypted_uploads_only = true + allow_ssl_requests_only = true + block_public_acls = true + block_public_policy = true + ignore_public_acls = true + restrict_public_buckets = true + + context = module.this.context +} + +module "glue_workflow" { + source = "cloudposse/glue/aws//modules/glue-workflow" + # Cloud Posse recommends pinning every module to a specific version + # version = "x.x.x" + + workflow_name = "geo" + workflow_description = "Glue workflow to process geo data" + max_concurrent_runs = 2 + default_run_properties = {} +} + +module "glue_job" { + source = "cloudposse/glue/aws//modules/glue-job" + # Cloud Posse recommends pinning every module to a specific version + # version = "x.x.x" + + job_name = "geo_processor" + job_description = "Glue Job for processing geo data" + role_arn = module.iam_role.arn + glue_version = "2.0" + default_arguments = {} + + worker_type = "Standard" + number_of_workers = 2 + max_retries = 2 + + # The job timeout in minutes + timeout = 20 + + command = { + name = "Run Python script" + script_location = format("s3://%s/geo.py", module.s3_bucket_job_source.bucket_id) + python_version = 3 + } + + context = module.this.context +} + +module "glue_trigger" { + source = "cloudposse/glue/aws//modules/glue-trigger" + # Cloud Posse recommends pinning every module to a specific 
version + # version = "x.x.x" + + workflow_name = module.glue_workflow.name + trigger_enabled = true + start_on_creation = true + trigger_description = "Glue Trigger that triggers the geo_processor Glue Job on a schedule" + schedule = "cron(15 12 * * ? *)" + type = "SCHEDULED" + + actions = [ + { + job_name = module.glue_job.name + # The job run timeout in minutes. It overrides the timeout value of the job + timeout = 10 + } + ] + + context = module.this.context +} +``` diff --git a/modules/glue-workflow/README.md b/modules/glue-workflow/README.md new file mode 100644 index 0000000..dc00f2e --- /dev/null +++ b/modules/glue-workflow/README.md @@ -0,0 +1,18 @@ +# glue-workflow + +Terraform module to provision AWS Glue Workflows. + +## Usage + +```hcl +module "glue_workflow" { + source = "cloudposse/glue/aws//modules/glue-workflow" + # Cloud Posse recommends pinning every module to a specific version + # version = "x.x.x" + + workflow_name = "geo" + workflow_description = "Glue workflow to process geo data" + max_concurrent_runs = 2 + default_run_properties = {} +} +```