Skip to content

in_tail: process non utf8 encodings with conversion engine #10542

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions plugins/in_tail/tail.c
Original file line number Diff line number Diff line change
Expand Up @@ -826,6 +826,15 @@ static struct flb_config_map config_map[] = {
"Currently, UTF-16LE, UTF-16BE, auto are supported.",
},
#endif
{
 /*
  * Input encoding used for the generic conversion to UTF-8.
  * Keep the names listed in the help text in sync with the ones accepted
  * by flb_unicode_generic_select_encoding_type().
  */
 FLB_CONFIG_MAP_STR, "generic.encoding", NULL,
 0, FLB_FALSE, 0,
 "specify the preferred input encoding for converting to UTF-8. "
 "Currently, the following encodings are supported: "
 "ShiftJIS, UHC, GBK, GB18030, Big5, "
 "Win866, Win874, "
 "Win1250, Win1251, Win1252, Win1253, Win1254, Win1255, Win1256",
},
/* EOF */
{0}
};
Expand Down
17 changes: 15 additions & 2 deletions plugins/in_tail/tail_config.c
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,7 @@
#include "tail_multiline.h"
#endif

#ifdef FLB_HAVE_UNICODE_ENCODER
#include <fluent-bit/flb_unicode.h>
#endif

static int multiline_load_parsers(struct flb_tail_config *ctx)
{
Expand Down Expand Up @@ -114,6 +112,7 @@ struct flb_tail_config *flb_tail_config_create(struct flb_input_instance *ins,
#ifdef FLB_HAVE_UNICODE_ENCODER
ctx->preferred_input_encoding = FLB_UNICODE_ENCODING_UNSPECIFIED;
#endif
ctx->generic_input_encoding_type = FLB_GENERIC_UNSPECIFIED; /* Default is unspecified */

/* Load the config map */
ret = flb_input_config_map_set(ins, (void *) ctx);
Expand Down Expand Up @@ -222,6 +221,20 @@ struct flb_tail_config *flb_tail_config_create(struct flb_input_instance *ins,
}
#endif

tmp = flb_input_get_property("generic.encoding", ins);
if (tmp) {
ret = flb_unicode_generic_select_encoding_type(tmp);
if (ret != FLB_GENERIC_UNSPECIFIED) {
ctx->generic_input_encoding_type = ret;
ctx->generic_input_encoding_name = tmp;
}
else {
flb_plg_error(ctx->ins, "invalid encoding 'generic.encoding' value %s", tmp);
flb_free(ctx);
return NULL;
}
}

#ifdef FLB_HAVE_PARSER
/* Config: multi-line support */
if (ctx->multiline == FLB_TRUE) {
Expand Down
2 changes: 2 additions & 0 deletions plugins/in_tail/tail_config.h
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,8 @@ struct flb_tail_config {
#ifdef FLB_HAVE_UNICODE_ENCODER
int preferred_input_encoding;
#endif
int generic_input_encoding_type;
const char *generic_input_encoding_name;

/* Multiline */
int multiline; /* multiline enabled ? */
Expand Down
18 changes: 15 additions & 3 deletions plugins/in_tail/tail_file.c
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,7 @@
#include "win32.h"
#endif

#ifdef FLB_HAVE_UNICODE_ENCODER
#include <fluent-bit/flb_unicode.h>
#endif

#include <cfl/cfl.h>

Expand Down Expand Up @@ -445,8 +443,8 @@ static int process_content(struct flb_tail_file *file, size_t *bytes)
time_t now = time(NULL);
struct flb_time out_time = {0};
struct flb_tail_config *ctx;
#ifdef FLB_HAVE_UNICODE_ENCODER
char *decoded = NULL;
#ifdef FLB_HAVE_UNICODE_ENCODER
size_t decoded_len;
#endif

Expand Down Expand Up @@ -485,6 +483,20 @@ static int process_content(struct flb_tail_file *file, size_t *bytes)
}
}
#endif
/*
 * Generic encoding conversion: when an input encoding was selected through
 * 'generic.encoding', try to convert the pending bytes [data, end) to UTF-8
 * before the rest of this function handles them.
 */
if (ctx->generic_input_encoding_type != FLB_GENERIC_UNSPECIFIED) {
    original_len = end - data;
    decoded = NULL;
    ret = flb_unicode_generic_convert_to_utf8(ctx->generic_input_encoding_name,
                                              (unsigned char *) data,
                                              (unsigned char **) &decoded,
                                              end - data);
    if (ret > 0) {
        /* Continue on the converted buffer instead of the raw input */
        data = decoded;
        end = data + strlen(decoded);
    }
    else {
        /*
         * The '*' precision of '%.*s' must be an int; 'end - data' is a
         * ptrdiff_t, so cast it explicitly to keep the varargs call defined.
         */
        flb_plg_error(ctx->ins, "encoding failed '%.*s' with status %d",
                      (int) (end - data), data, ret);
    }
}

/* Skip null characters from the head (sometimes introduced by copy-truncate log rotation) */
while (data < end && *data == '\0') {
Expand Down
199 changes: 199 additions & 0 deletions tests/runtime/data/tail/generate_generic_encoder_testing_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,199 @@
# -*- coding: utf-8 -*-

import os

# This script generates a set of text files for testing various character encodings.
# Each file contains a curated list of common, neutral words appropriate for the
# target language and encoding.
#
# The word lists specifically exclude:
# - Religious terminology
# - Names of capital cities
#
# To use this script:
# 1. Save it as a Python file (e.g., `generate_generic_encoder_testing_data.py`).
# 2. Run it from your terminal: `python generate_generic_encoder_testing_data.py`
# 3. The script will create one `generic_enc_<encoding>.log` file per encoding
#    in the `log` subdirectory next to the script.

# Dictionary of encodings and their corresponding test data.
# The keys are Python codec names (they are also used to derive the filenames).
# The values are lists of strings to be written to the files, one per line.
# Every string must be representable in the codec named by its key; otherwise
# the corresponding fixture cannot be encoded.
ENCODING_DATA = {
    # --- East Asian Encodings ---
    "sjis": [
        "こんにちは",  # Hello
        "ありがとう",  # Thank you
        "さようなら",  # Goodbye
        "日本",  # Japan
        "猫",  # Cat
        "犬",  # Dog
        "食べる",  # To eat
        "飲む",  # To drink
        "空",  # Sky
        "海",  # Sea
        "月",  # Moon
        "花",  # Flower
    ],
    "big5": [
        "你好",  # Hello
        "謝謝",  # Thank you
        "再見",  # Goodbye
        "貓",  # Cat
        "狗",  # Dog
        "吃",  # To eat
        "喝",  # To drink
        "天",  # Sky
        "海",  # Sea
        "月亮",  # Moon
        "花卉",  # Flower
    ],
    "gbk": [
        "你好",  # Hello
        "谢谢",  # Thank you
        "再见",  # Goodbye
        "中国",  # China
        "猫",  # Cat
        "狗",  # Dog
        "吃",  # To eat
        "喝",  # To drink
        "天",  # Sky
        "海",  # Sea
        "月亮",  # Moon
        "花",  # Flower
    ],
    "gb18030": [  # Superset of GBK, can include the same + more
        "你好", "谢谢", "再见", "中国", "猫", "狗", "吃", "喝", "天", "海",
        "欧元符号€",  # Euro symbol to test expanded range
        "龘", "龍",  # Complex characters
    ],
    "euc-kr": [  # Often used for Korean, UHC is a Microsoft equivalent
        "안녕하세요",  # Hello
        "감사합니다",  # Thank you
        "안녕히 가세요",  # Goodbye
        "한국",  # Korea
        "고양이",  # Cat
        "개",  # Dog
        "먹다",  # To eat
        "마시다",  # To drink
        "하늘",  # Sky
        "바다",  # Sea
        "달",  # Moon
        "꽃",  # Flower
    ],

    # --- Windows Codepage Encodings ---
    "cp866": [  # Cyrillic (DOS)
        "Привет",  # Hello
        "Спасибо",  # Thank you
        "До свидания",  # Goodbye
        "Компьютер",  # Computer
        "Информация",  # Information
        "Программа",  # Program
        "Файл",  # File
    ],
    "cp874": [  # Thai
        "สวัสดี",  # Hello
        "ขอบคุณ",  # Thank you
        "ลาก่อน",  # Goodbye
        "ภาษาไทย",  # Thai language
        "แมว",  # Cat
        "สุนัข",  # Dog
        "กิน",  # Eat
        "ดื่ม",  # Drink
    ],
    "cp1250": [  # Central European (Polish, Czech, etc.)
        "Cześć", "Dziękuję",  # Polish
        "Ahoj", "Děkuji",  # Czech
        "Žluťoučký kůň",  # Czech phrase with diacritics
        "Gęślą jaźń",  # Polish phrase with diacritics
        "Árvíztűrő tükörfúrógép",  # Hungarian
    ],
    "cp1251": [  # Cyrillic (Windows)
        "Привет", "Спасибо", "До свидания",
        "Кошка", "Собака", "Небо", "Море",
        "Български език",  # Bulgarian
        "Українська мова",  # Ukrainian
        "Беларуская мова",  # Belarusian
    ],
    "cp1252": [  # Western European
        "Hello", "Thank you", "Goodbye",  # English
        "Bonjour", "Merci", "Au revoir",  # French
        "Hallo", "Danke", "Auf Wiedersehen",  # German
        "Hola", "Gracias", "Adiós",  # Spanish
        "Crème brûlée", "Piñata", "Fjord",
    ],
    "cp1253": [  # Greek
        "Γειά σου",  # Hello
        "Ευχαριστώ",  # Thank you
        "Αντίο",  # Goodbye
        "Ελληνικά",  # Greek
        "Γάτα",  # Cat
        "Σκύλος",  # Dog
        "Ουρανός",  # Sky
        "Θάλασσα",  # Sea
    ],
    "cp1254": [  # Turkish
        "Merhaba", "Teşekkür ederim", "Hoşça kal",
        "Türkiye", "Kedi", "Köpek",
        "Yemek", "İçmek", "Gök", "Deniz",
        "Öğrenci", "Işık", "Ağaç",  # Words with specific Turkish chars
    ],
    "cp1255": [  # Hebrew
        "שלום",  # Hello/Peace
        "תודה",  # Thank you
        "להתראות",  # Goodbye
        "עברית",  # Hebrew
        "חתול",  # Cat
        "כלב",  # Dog
        "שמיים",  # Sky
        "ים",  # Sea
    ],
    "cp1256": [  # Arabic
        "مرحبا",  # Hello
        "شكرا",  # Thank you
        "مع السلامة",  # Goodbye
        "العربية",  # Arabic
        "قط",  # Cat
        "كلب",  # Dog
        "سماء",  # Sky
        "بحر",  # Sea
    ],
}

def generate_files(encoding_data=None, output_dir=None):
    """
    Write one encoded fixture file per entry of an encoding table.

    Args:
        encoding_data: Mapping of Python codec name -> list of strings, one
            per output line. Defaults to the module-level ENCODING_DATA.
        output_dir: Directory holding the ``log`` subdirectory that receives
            the files. Defaults to the directory containing this script.

    Each file is written to ``<output_dir>/log/generic_enc_<prefix>.log``,
    where ``<prefix>`` is the codec name with a leading ``cp`` replaced by
    ``win`` and ``euc-kr`` mapped to ``uhc``. A failure for one encoding is
    reported on stdout and does not stop the remaining encodings.
    """
    if encoding_data is None:
        encoding_data = ENCODING_DATA
    if output_dir is None:
        # Default to the directory where the script lives.
        output_dir = os.path.dirname(os.path.abspath(__file__))

    # The fixtures live in a "log" subdirectory; create it when missing so a
    # fresh checkout does not fail every single write.
    log_dir = os.path.join(output_dir, "log")
    os.makedirs(log_dir, exist_ok=True)
    print(f"Files will be generated in: {log_dir}\n")

    for encoding, content_list in encoding_data.items():
        # Derive the filename prefix: only a leading "cp" becomes "win";
        # UHC is used as the name for the euc-kr data in this context.
        if encoding.startswith("cp"):
            filename_prefix = "win" + encoding[len("cp"):]
        elif encoding == "euc-kr":
            filename_prefix = "uhc"
        else:
            filename_prefix = encoding

        file_path = os.path.join(log_dir, f"generic_enc_{filename_prefix}.log")

        try:
            # Encode before touching the filesystem so an unencodable entry
            # really leaves no file behind, and write bytes so the line
            # endings are '\n' on every platform.
            payload = ('\n'.join(content_list) + '\n').encode(encoding)
            with open(file_path, 'wb') as f:
                f.write(payload)
            print(f"Successfully created: {os.path.basename(file_path)} (Encoding: {encoding})")

        except UnicodeEncodeError as e:
            print(f"Error: Could not encode content for '{encoding}'.")
            print(f"  - File not created: {os.path.basename(file_path)}")
            print(f"  - Details: {e}")
        except Exception as e:
            print(f"An unexpected error occurred for '{encoding}': {e}")

# Generate the fixture files only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    generate_files()
11 changes: 11 additions & 0 deletions tests/runtime/data/tail/log/generic_enc_big5.log
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
§A¦n
ÁÂÁÂ
¦A¨£
¿ß
ª¯
¦Y
³Ü
¤Ñ
®ü
¤ë«G
ªá¥c
13 changes: 13 additions & 0 deletions tests/runtime/data/tail/log/generic_enc_gb18030.log
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
ÄãºÃ
лл
ÔÙ¼û
Öйú
è
¹·
³Ô
ºÈ
Ìì
º£
Å·Ôª·ûºÅ¢ã
ý“
ýˆ
12 changes: 12 additions & 0 deletions tests/runtime/data/tail/log/generic_enc_gbk.log
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
ÄãºÃ
лл
ÔÙ¼û
Öйú
è
¹·
³Ô
ºÈ
Ìì
º£
ÔÂÁÁ
Ȭ
12 changes: 12 additions & 0 deletions tests/runtime/data/tail/log/generic_enc_sjis.log
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
‚±‚ñ‚É‚¿‚Í
‚ ‚肪‚Æ‚¤
‚³‚悤‚È‚ç
“ú–{
”L
΢
H‚ׂé
ˆù‚Þ
‹ó
ŠC
ŒŽ
‰Ô
12 changes: 12 additions & 0 deletions tests/runtime/data/tail/log/generic_enc_uhc.log
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
¾È³çÇϼ¼¿ä
°¨»çÇÕ´Ï´Ù
¾È³çÈ÷ °¡¼¼¿ä
Çѱ¹
°í¾çÀÌ
°³
¸Ô´Ù
¸¶½Ã´Ù
ÇÏ´Ã
¹Ù´Ù
´Þ
²É
7 changes: 7 additions & 0 deletions tests/runtime/data/tail/log/generic_enc_win1250.log
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
CzeϾ
Dziêkujê
Ahoj
Dìkuji
Žluouèký kùò
Gêœl¹ jaŸñ
Árvíztûrõ tükörfúrógép
10 changes: 10 additions & 0 deletions tests/runtime/data/tail/log/generic_enc_win1251.log
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
Ïðèâåò
Ñïàñèáî
Äî ñâèäàíèÿ
Êîøêà
Ñîáàêà
Íåáî
Ìîðå
Áúëãàðñêè åçèê
Óêðà¿íñüêà ìîâà
Áåëàðóñêàÿ ìîâà
Loading
Loading