diff --git a/formats/ProtoBuf/README.md b/formats/ProtoBuf/README.md new file mode 100644 index 00000000..08bb6a67 --- /dev/null +++ b/formats/ProtoBuf/README.md @@ -0,0 +1,130 @@ +# Generating `protobuf_messages.bin` + +We'll use Python to serialize some data into `protobuf_messages.bin`. + +Run the following command to generate a Python file named `schemafile_pb2.py` in the same directory as `schemafile.proto`. +This file contains the Python classes that represent your `MessageType` Protobuf message: + +``` +protoc --python_out=. schemafile.proto +``` + +Now, create a new Python file named `generate_protobuf_data.py`, in the same directory as `schemafile_pb2.py`. Paste the following code into it: + +``` +import schemafile_pb2 # Module generated by 'protoc' +from google.protobuf import text_format +from google.protobuf.internal.encoder import _VarintBytes # Import the internal varint encoder + +def create_user_data_message(name, surname, birthDate, phoneNumbers): + """ + Creates and populates a MessageType Protobuf message. 
+ """ + message = schemafile_pb2.MessageType() + message.name = name + message.surname = surname + message.birthDate = birthDate + message.phoneNumbers.extend(phoneNumbers) + return message + +# The data for our example users +data_to_serialize = [ + {"name": "Aisha", "surname": "Khan", "birthDate": 19920815, "phoneNumbers": ["(555) 247-8903", "(555) 612-3457"]}, + {"name": "Javier", "surname": "Rodriguez", "birthDate": 20001015, "phoneNumbers": ["(555) 891-2046", "(555) 738-5129"]}, + {"name": "Mei", "surname": "Ling", "birthDate": 19980616, "phoneNumbers": ["(555) 956-1834", "(555) 403-7682"]}, +] + +output_filename = "protobuf_messages.bin" + +# Open the binary file in write-binary mode ('wb') +with open(output_filename, "wb") as f: + for item in data_to_serialize: + # Create a Protobuf message instance for the current user + message = create_user_data_message( + item["name"], + item["surname"], + item["birthDate"], + item["phoneNumbers"] + ) + + # Serialize the message + serialized_data = message.SerializeToString() + + # Get the length of the serialized data + message_length = len(serialized_data) + + # Use the Protobuf library's internal _VarintBytes to encode the length + length_prefix = _VarintBytes(message_length) + + # Write the length prefix + f.write(length_prefix) + # Write the serialized message data + f.write(serialized_data) + +print(f"Protobuf messages (length-delimited) written to {output_filename}") + +# --- Optional: Verification (reading back and printing) --- +# For reading back, we'll also use the internal Protobuf decoder for varints. 
"""Write example `MessageType` records to `protobuf_messages.bin`.

Every record is serialized with `SerializeToString()` and written with its
byte length in front, encoded as a base-128 varint. This is the same
length-delimited framing that the read-back loop in the README decodes.

The generated module `schemafile_pb2` must exist before running this script:

    protoc --python_out=. schemafile.proto
"""


def encode_varint(value):
    """Return `value` encoded as a Protobuf base-128 varint.

    Args:
        value: Non-negative integer to encode.

    Returns:
        The encoded bytes: 7 payload bits per byte, least-significant group
        first, with the high bit set on every byte except the last.

    Raises:
        ValueError: If `value` is negative.
    """
    if value < 0:
        raise ValueError(f"varint value must be non-negative, got {value}")
    encoded = bytearray()
    while value > 0x7F:
        # More groups follow, so set the continuation bit.
        encoded.append((value & 0x7F) | 0x80)
        value >>= 7
    encoded.append(value)
    return bytes(encoded)


def frame_message(payload):
    """Return `payload` preceded by its varint-encoded length.

    Args:
        payload: The serialized message bytes.

    Returns:
        `encode_varint(len(payload)) + payload`.
    """
    return encode_varint(len(payload)) + payload


def create_user_data_message(name, surname, birthDate, phoneNumbers):
    """Create and populate a `MessageType` Protobuf message.

    Args:
        name: Value for the `name` field.
        surname: Value for the `surname` field.
        birthDate: Value for the `birthDate` field (`uint32` in the schema).
        phoneNumbers: Iterable of strings appended to `phoneNumbers`.

    Returns:
        The populated `schemafile_pb2.MessageType` instance.
    """
    # Imported here rather than at module level: schemafile_pb2 only exists
    # after running protoc, and the framing helpers above do not need it.
    import schemafile_pb2  # Module generated by 'protoc'

    message = schemafile_pb2.MessageType()
    message.name = name
    message.surname = surname
    message.birthDate = birthDate
    message.phoneNumbers.extend(phoneNumbers)
    return message


# The data for our example users (same records as in the README)
data_to_serialize = [
    {"name": "Aisha", "surname": "Khan", "birthDate": 19920815, "phoneNumbers": ["(555) 247-8903", "(555) 612-3457"]},
    {"name": "Javier", "surname": "Rodriguez", "birthDate": 20001015, "phoneNumbers": ["(555) 891-2046", "(555) 738-5129"]},
    {"name": "Mei", "surname": "Ling", "birthDate": 19980616, "phoneNumbers": ["(555) 956-1834", "(555) 403-7682"]},
]

output_filename = "protobuf_messages.bin"


def main():
    """Serialize every example record into `output_filename`."""
    # Open the binary file in write-binary mode ('wb')
    with open(output_filename, "wb") as f:
        for item in data_to_serialize:
            # Create a Protobuf message instance for the current user
            message = create_user_data_message(
                item["name"],
                item["surname"],
                item["birthDate"],
                item["phoneNumbers"],
            )

            # Serialize the message and add the length prefix
            f.write(frame_message(message.SerializeToString()))


if __name__ == "__main__":
    main()
# -*- coding: utf-8 -*-
# Generated by the protocol buffer compiler.  DO NOT EDIT!
# NO CHECKED-IN PROTOBUF GENCODE
# source: schemafile.proto
# Protobuf Python Version: 5.29.3
"""Generated protocol buffer code."""
from google.protobuf import descriptor as _descriptor
from google.protobuf import descriptor_pool as _descriptor_pool
from google.protobuf import runtime_version as _runtime_version
from google.protobuf import symbol_database as _symbol_database
from google.protobuf.internal import builder as _builder
_runtime_version.ValidateProtobufRuntimeVersion(
    _runtime_version.Domain.PUBLIC,
    5,
    29,
    3,
    '',
    'schemafile.proto'
)
# @@protoc_insertion_point(imports)

_sym_db = _symbol_database.Default()




# Serialized file descriptor for schemafile.proto (proto3). It declares one
# message, MessageType, with fields: name = 1 (string), surname = 2 (string),
# birthDate = 3 (uint32), phoneNumbers = 4 (repeated string).
DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x10schemafile.proto\"U\n\x0bMessageType\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x0f\n\x07surname\x18\x02 \x01(\t\x12\x11\n\tbirthDate\x18\x03 \x01(\r\x12\x14\n\x0cphoneNumbers\x18\x04 \x03(\tb\x06proto3')

_globals = globals()
_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals)
_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'schemafile_pb2', _globals)
if not _descriptor._USE_C_DESCRIPTORS:
  DESCRIPTOR._loaded_options = None
  # Byte offsets of the MessageType descriptor inside the serialized file
  # above: it starts after the 20-byte prefix and is 85 bytes long.
  _globals['_MESSAGETYPE']._serialized_start=20
  _globals['_MESSAGETYPE']._serialized_end=105
# @@protoc_insertion_point(module_scope)