fluent · edsiper · Jul 6, 2025 · Jul 4, 2025 · Jul 4, 2025 · Jul 4, 2025
@@ -826,6 +826,15 @@ static struct flb_config_map config_map[] = {
      "Currently, UTF-16LE, UTF-16BE, auto are supported.",
     },
 #endif
+    /*
+     * generic.encoding: name of the legacy (non-UTF-8) encoding used by the
+     * tailed files. The value is read with flb_input_get_property() when the
+     * tail configuration is created.
+     */
+    {
+     FLB_CONFIG_MAP_STR, "generic.encoding", NULL,
+     0, FLB_FALSE, 0,
+     "specify the preferred input encoding for converting to UTF-8. "
+     "Currently, the following encodings are supported: "
+     "ShiftJIS, UHC, GBK, GB18030, Big5, "
+     "Win866, Win874, "
+     "Win1250, Win1251, Win1252, Win1253, Win1254, Win1255, Win1256",
+    },
     /* EOF */
     {0}
 };

@@ -36,9 +36,7 @@
 #include "tail_multiline.h"
 #endif
 
-#ifdef FLB_HAVE_UNICODE_ENCODER
 #include <fluent-bit/flb_unicode.h>
-#endif
 
 static int multiline_load_parsers(struct flb_tail_config *ctx)
 {
@@ -114,6 +112,7 @@ struct flb_tail_config *flb_tail_config_create(struct flb_input_instance *ins,
 #ifdef FLB_HAVE_UNICODE_ENCODER
     ctx->preferred_input_encoding = FLB_UNICODE_ENCODING_UNSPECIFIED;
 #endif
+    ctx->generic_input_encoding_type = FLB_GENERIC_UNSPECIFIED; /* Default is unspecified */
 
     /* Load the config map */
     ret = flb_input_config_map_set(ins, (void *) ctx);
@@ -222,6 +221,20 @@ struct flb_tail_config *flb_tail_config_create(struct flb_input_instance *ins,
     }
 #endif
 
+    /*
+     * Optional 'generic.encoding' property: resolve the configured name to
+     * its encoding type and remember both on the context. A name that does
+     * not resolve to a known type aborts the configuration.
+     */
+    tmp = flb_input_get_property("generic.encoding", ins);
+    if (tmp != NULL) {
+        ret = flb_unicode_generic_select_encoding_type(tmp);
+        if (ret == FLB_GENERIC_UNSPECIFIED) {
+            flb_plg_error(ctx->ins, "invalid encoding 'generic.encoding' value %s", tmp);
+            flb_free(ctx);
+            return NULL;
+        }
+
+        /* the property string is referenced as-is; no copy is made here */
+        ctx->generic_input_encoding_type = ret;
+        ctx->generic_input_encoding_name = tmp;
+    }
+
 #ifdef FLB_HAVE_PARSER
     /* Config: multi-line support */
     if (ctx->multiline == FLB_TRUE) {

@@ -129,6 +129,8 @@ struct flb_tail_config {
 #ifdef FLB_HAVE_UNICODE_ENCODER
     int preferred_input_encoding;
 #endif
+    int generic_input_encoding_type;         /* type resolved from 'generic.encoding'; FLB_GENERIC_UNSPECIFIED when unset */
+    const char *generic_input_encoding_name; /* 'generic.encoding' property value, passed to the UTF-8 converter */
 
     /* Multiline */
     int multiline;             /* multiline enabled ?  */

@@ -48,9 +48,7 @@
 #include "win32.h"
 #endif
 
-#ifdef FLB_HAVE_UNICODE_ENCODER
 #include <fluent-bit/flb_unicode.h>
-#endif
 
 #include <cfl/cfl.h>
 
@@ -445,8 +443,8 @@ static int process_content(struct flb_tail_file *file, size_t *bytes)
     time_t now = time(NULL);
     struct flb_time out_time = {0};
     struct flb_tail_config *ctx;
-#ifdef FLB_HAVE_UNICODE_ENCODER
     char *decoded = NULL;
+#ifdef FLB_HAVE_UNICODE_ENCODER
     size_t decoded_len;
 #endif
 
@@ -485,6 +483,20 @@ static int process_content(struct flb_tail_file *file, size_t *bytes)
         }
     }
 #endif
+    /*
+     * Generic (non-Unicode) encodings: when 'generic.encoding' is configured,
+     * convert the pending bytes to UTF-8 and continue on the converted
+     * buffer. If the conversion does not succeed, only an error is logged and
+     * the original bytes are processed unchanged.
+     */
+    if (ctx->generic_input_encoding_type != FLB_GENERIC_UNSPECIFIED) {
+        original_len = end - data;
+        decoded = NULL;
+        ret = flb_unicode_generic_convert_to_utf8(ctx->generic_input_encoding_name,
+                                                  (unsigned char*)data, (unsigned char**)&decoded,
+                                                  end - data);
+        if (ret > 0) {
+            /*
+             * NOTE(review): the new end is computed with strlen(), which
+             * assumes the converter NUL-terminates its output -- confirm.
+             * NOTE(review): 'decoded' presumably has to be released once the
+             * content has been processed -- verify against the rest of the
+             * function.
+             */
+            data = decoded;
+            end  = data + strlen(decoded);
+        }
+        else {
+            /* '%.*s' requires an int precision; 'end - data' is a ptrdiff_t */
+            flb_plg_error(ctx->ins, "encoding failed '%.*s' with status %d",
+                          (int) (end - data), data, ret);
+        }
+    }
 
     /* Skip null characters from the head (sometimes introduced by copy-truncate log rotation) */
     while (data < end && *data == '\0') {

@@ -0,0 +1,199 @@
+# -*- coding: utf-8 -*-
+
+import os
+
+# This script generates a set of text files for testing various character encodings.
+# Each file contains a curated list of common, neutral words appropriate for the
+# target language and encoding.
+#
+# The word lists specifically exclude:
+# - Religious terminology
+# - Names of capital cities
+#
+# To use this script:
+# 1. Save it as a Python file (e.g., `generate_files.py`).
+# 2. Run it from your terminal: `python generate_files.py`
+# 3. The script will create several generic_enc_*.log files in the "log" directory next to the script.
+
+# Dictionary of encodings and their corresponding test data.
+# The keys are the encoding names (and will be used in the filenames).
+# The values are lists of strings to be written to the files.
+ENCODING_DATA = {
+    # --- East Asian Encodings ---
+    "sjis": [
+        "ã“ã‚“ã«ã¡ã¯",  # Hello
+        "ã‚ã‚ŠãŒã¨ã†",  # Thank you
+        "ã•ã‚ˆã†ãªã‚‰",  # Goodbye
+        "æ—¥æœ¬",        # Japan
+        "çŒ«",          # Cat
+        "çŠ¬",          # Dog
+        "é£Ÿã¹ã‚‹",      # To eat
+        "é£²ã‚€",        # To drink
+        "ç©º",          # Sky
+        "æµ·",         # Sea
+        "æœˆ",          # Moon
+        "èŠ±",          # Flower
+    ],
+    "big5": [
+        "ä½ å¥½",        # Hello
+        "è¬è¬",        # Thank you
+        "å†è¦‹",        # Goodbye
+        "è²“",          # Cat
+        "ç‹—",          # Dog
+        "åƒ",          # To eat
+        "å–",          # To drink
+        "å¤©",          # Sky
+        "æµ·",          # Sea
+        "æœˆäº®",        # Moon
+        "èŠ±å‰",        # Flower
+    ],
+    "gbk": [
+        "ä½ å¥½",        # Hello
+        "è°¢è°¢",        # Thank you
+        "å†è§",        # Goodbye
+        "ä¸å›½",        # China
+        "çŒ«",          # Cat
+        "ç‹—",          # Dog
+        "åƒ",          # To eat
+        "å–",          # To drink
+        "å¤©",          # Sky
+        "æµ·",          # Sea
+        "æœˆäº®",        # Moon
+        "èŠ±",          # Flower
+    ],
+    "gb18030": [ # Superset of GBK, can include the same + more
+        "ä½ å¥½", "è°¢è°¢", "å†è§", "ä¸å›½", "çŒ«", "ç‹—", "åƒ", "å–", "å¤©", "æµ·",
+        "æ¬§å…ƒç¬¦å·â‚¬", # Euro symbol to test expanded range
+        "é¾˜", "é¾", # Complex characters
+    ],
+    "euc-kr": [ # Often used for Korean, UHC is a Microsoft equivalent
+        "ì•ˆë…•í•˜ì„¸ìš”",  # Hello
+        "ê°ì‚¬í•©ë‹ˆë‹¤",  # Thank you
+        "ì•ˆë…•ížˆ ê°€ì„¸ìš”",# Goodbye
+        "í•œêµ",        # Korea
+        "ê³ ì–‘ì´",      # Cat
+        "ê°œ",          # Dog
+        "ë¨¹ë‹¤",        # To eat
+        "ë§ˆì‹œë‹¤",      # To drink
+        "í•˜ëŠ˜",        # Sky
+        "ë°”ë‹¤",        # Sea
+        "ë‹¬",          # Moon
+        "ê½ƒ",          # Flower
+    ],
+
+    # --- Windows Codepage Encodings ---
+    "cp866": [ # Cyrillic (DOS)
+        "ÐŸÑ€Ð¸Ð²ÐµÑ‚",      # Hello
+        "Ð¡Ð¿Ð°ÑÐ¸Ð±Ð¾",      # Thank you
+        "Ð”Ð¾ ÑÐ²Ð¸Ð´Ð°Ð½Ð¸Ñ", # Goodbye
+        "ÐšÐ¾Ð¼Ð¿ÑŒÑŽÑ‚ÐµÑ€",   # Computer
+        "Ð˜Ð½Ñ„Ð¾Ñ€Ð¼Ð°Ñ†Ð¸Ñ",  # Information
+        "ÐŸÑ€Ð¾Ð³Ñ€Ð°Ð¼Ð¼Ð°",   # Program
+        "Ð¤Ð°Ð¹Ð»",        # File
+    ],
+    "cp874": [ # Thai
+        "à¸ªà¸§à¸±à¸ªà¸”à¸µ",     # Hello
+        "à¸‚à¸à¸šà¸„à¸¸à¸“",     # Thank you
+        "à¸¥à¸²à¸à¹ˆà¸à¸™",     # Goodbye
+        "à¸ à¸²à¸©à¸²à¹„à¸—à¸¢",   # Thai language
+        "à¹à¸¡à¸§",         # Cat
+        "à¸ªà¸¸à¸™à¸±à¸‚",       # Dog
+        "à¸à¸´à¸™",         # Eat
+        "à¸”à¸·à¹ˆà¸¡",        # Drink
+    ],
+    "cp1250": [ # Central European (Polish, Czech, etc.)
+        "CzeÅ›Ä‡", "DziÄ™kujÄ™", # Polish
+        "Ahoj", "DÄ›kuji",     # Czech
+        "Å½luÅ¥ouÄkÃ½ kÅ¯Åˆ",      # Czech phrase with diacritics
+        "GÄ™Å›lÄ… jaÅºÅ„",         # Polish phrase with diacritics
+        "ÃrvÃztÅ±rÅ‘ tÃ¼kÃ¶rfÃºrÃ³gÃ©p", # Hungarian
+    ],
+    "cp1251": [ # Cyrillic (Windows)
+        "ÐŸÑ€Ð¸Ð²ÐµÑ‚", "Ð¡Ð¿Ð°ÑÐ¸Ð±Ð¾", "Ð”Ð¾ ÑÐ²Ð¸Ð´Ð°Ð½Ð¸Ñ",
+        "ÐšÐ¾ÑˆÐºÐ°", "Ð¡Ð¾Ð±Ð°ÐºÐ°", "ÐÐµÐ±Ð¾", "ÐœÐ¾Ñ€Ðµ",
+        "Ð‘ÑŠÐ»Ð³Ð°Ñ€ÑÐºÐ¸ ÐµÐ·Ð¸Ðº", # Bulgarian
+        "Ð£ÐºÑ€Ð°Ñ—Ð½ÑÑŒÐºÐ° Ð¼Ð¾Ð²Ð°",# Ukrainian
+        "Ð‘ÐµÐ»Ð°Ñ€ÑƒÑÐºÐ°Ñ Ð¼Ð¾Ð²Ð°",# Belarusian
+    ],
+    "cp1252": [ # Western European
+        "Hello", "Thank you", "Goodbye", # English
+        "Bonjour", "Merci", "Au revoir", # French
+        "Hallo", "Danke", "Auf Wiedersehen", # German
+        "Hola", "Gracias", "AdiÃ³s", # Spanish
+        "CrÃ¨me brÃ»lÃ©e", "PiÃ±ata", "Fjord",
+    ],
+    "cp1253": [ # Greek
+        "Î“ÎµÎ¹Î¬ ÏƒÎ¿Ï…",    # Hello
+        "Î•Ï…Ï‡Î±ÏÎ¹ÏƒÏ„ÏŽ",   # Thank you
+        "Î‘Î½Ï„Î¯Î¿",       # Goodbye
+        "Î•Î»Î»Î·Î½Î¹ÎºÎ¬",    # Greek
+        "Î“Î¬Ï„Î±",        # Cat
+        "Î£ÎºÏÎ»Î¿Ï‚",      # Dog
+        "ÎŸÏ…ÏÎ±Î½ÏŒÏ‚",     # Sky
+        "Î˜Î¬Î»Î±ÏƒÏƒÎ±",     # Sea
+    ],
+    "cp1254": [ # Turkish
+        "Merhaba", "TeÅŸekkÃ¼r ederim", "HoÅŸÃ§a kal",
+        "TÃ¼rkiye", "Kedi", "KÃ¶pek",
+        "Yemek", "Ä°Ã§mek", "GÃ¶k", "Deniz",
+        "Ã–ÄŸrenci", "IÅŸÄ±k", "AÄŸaÃ§", # Words with specific Turkish chars
+    ],
+    "cp1255": [ # Hebrew
+        "×©×œ×•×",        # Hello/Peace
+        "×ª×•×“×”",        # Thank you
+        "×œ×”×ª×¨××•×ª",     # Goodbye
+        "×¢×‘×¨×™×ª",       # Hebrew
+        "×—×ª×•×œ",        # Cat
+        "×›×œ×‘",         # Dog
+        "×©×ž×™×™×",       # Sky
+        "×™×",          # Sea
+    ],
+    "cp1256": [ # Arabic
+        "Ù…Ø±ØØ¨Ø§",       # Hello
+        "Ø´ÙƒØ±Ø§",        # Thank you
+        "Ù…Ø¹ Ø§Ù„Ø³Ù„Ø§Ù…Ø©",  # Goodbye
+        "Ø§Ù„Ø¹Ø±Ø¨ÙŠØ©",     # Arabic
+        "Ù‚Ø·",          # Cat
+        "ÙƒÙ„Ø¨",         # Dog
+        "Ø³Ù…Ø§Ø¡",        # Sky
+        "Ø¨ØØ±",         # Sea
+    ],
+}
+
+def generate_files():
+    """
+    Iterates through the ENCODING_DATA dictionary and creates a file for each entry.
+    """
+    # Get the directory where the script is running to save files there.
+    output_dir = os.path.dirname(os.path.abspath(__file__))
+    print(f"Files will be generated in: {output_dir}\n")
+
+    for encoding, content_list in ENCODING_DATA.items():
+        # Sanitize encoding name for use in filename, replacing cp with win
+        # for clarity as requested. UHC is an alias for euc-kr in this context.
+        if encoding.startswith("cp"):
+            filename_prefix = encoding.replace("cp", "win")
+        elif encoding == "euc-kr":
+            filename_prefix = "uhc"
+        else:
+            filename_prefix = encoding
+
+        file_path = os.path.join(output_dir, "log", f"generic_enc_{filename_prefix}.log")
+
+        try:
+            # Open the file with the specified encoding
+            with open(file_path, 'w', encoding=encoding) as f:
+                # Join the list of words with newline characters
+                f.write('\n'.join(content_list))
+                f.write('\n')
+            print(f"Successfully created: {os.path.basename(file_path)} (Encoding: {encoding})")
+
+        except UnicodeEncodeError as e:
+            print(f"Error: Could not encode content for '{encoding}'.")
+            print(f"  - File not created: {os.path.basename(file_path)}")
+            print(f"  - Details: {e}")
+        except Exception as e:
+            print(f"An unexpected error occurred for '{encoding}': {e}")
+
+# Generate the fixtures only when run as a script, not when imported.
+if __name__ == "__main__":
+    generate_files()
@@ -0,0 +1,11 @@
+§A¦n
+ÁÂÁÂ
+¦A¨£
+¿ß
+ª¯
+¦Y
+³Ü
+¤Ñ
+®ü
+¤ë«G
+ªá¥c
@@ -0,0 +1,13 @@
+ÄãºÃ
+Ð»Ð»
+ÔÙ¼û
+ÖÐ¹ú
+Ã¨
+¹·
+³Ô
+ºÈ
+Ìì
+º£
+Å·Ôª·ûºÅ¢ã
+ý“
+ýˆ
@@ -0,0 +1,12 @@
+ÄãºÃ
+Ð»Ð»
+ÔÙ¼û
+ÖÐ¹ú
+Ã¨
+¹·
+³Ô
+ºÈ
+Ìì
+º£
+ÔÂÁÁ
+»¨
@@ -0,0 +1,12 @@
+‚±‚ñ‚É‚¿‚Í
+‚ ‚è‚ª‚Æ‚¤
+‚³‚æ‚¤‚È‚ç
+“ú–{
+”L
+Œ¢
+H‚×‚é
+ˆù‚Þ
+‹ó
+ŠC
+ŒŽ
+‰Ô
@@ -0,0 +1,12 @@
+¾È³çÇÏ¼¼¿ä
+°¨»çÇÕ´Ï´Ù
+¾È³çÈ÷ °¡¼¼¿ä
+ÇÑ±¹
+°í¾çÀÌ
+°³
+¸Ô´Ù
+¸¶½Ã´Ù
+ÇÏ´Ã
+¹Ù´Ù
+´Þ
+²É
@@ -0,0 +1,7 @@
+Czeœæ
+Dziêkujê
+Ahoj
+Dìkuji
+Žluouèký kùò
+Gêœl¹ jaŸñ
+Árvíztûrõ tükörfúrógép
@@ -0,0 +1,10 @@
+Ïðèâåò
+Ñïàñèáî
+Äî ñâèäàíèÿ
+Êîøêà
+Ñîáàêà
+Íåáî
+Ìîðå
+Áúëãàðñêè åçèê
+Óêðà¿íñüêà ìîâà
+Áåëàðóñêàÿ ìîâà