attardi · wayneworkman · Mar 31, 2023
diff --git a/README.md b/README.md
@@ -56,8 +56,7 @@ Each file will contains several documents in this [document format](https://gith
 
 ```
 usage: wikiextractor [-h] [-o OUTPUT] [-b n[KMG]] [-c] [--json] [--html] [-l] [-ns ns1,ns2]
-			 [--templates TEMPLATES] [--no-templates] [--html-safe HTML_SAFE] [--processes PROCESSES]
-			 [-q] [--debug] [-a] [-v]
+			 [--preserve-unicode] [--templates TEMPLATES] [--no-templates] [--html-safe HTML_SAFE] [--processes PROCESSES] [-q] [--debug] [-a] [-v]
 			 input
 
 Wikipedia Extractor:
@@ -93,6 +92,8 @@ Output:
 			    maximum bytes per output file (default 1M)
   -c, --compress        compress output files using bzip
   --json                write output in json format instead of the default <doc> format
+  --preserve-unicode
+          Keep non-ASCII (unicode) characters as-is in JSON output instead of escaping them as \uXXXX sequences
 
 Processing:
   --html                produce HTML output, subsumes --links

diff --git a/wikiextractor/WikiExtractor.py b/wikiextractor/WikiExtractor.py
@@ -548,6 +548,8 @@ def main():
                         help="compress output files using bzip")
     groupO.add_argument("--json", action="store_true",
                         help="write output in json format instead of the default <doc> format")
+    groupO.add_argument("--preserve-unicode", action="store_true",
+                        help="Do not convert unicode characters to ascii characters when using JSON output")
 
     groupP = parser.add_argument_group('Processing')
     groupP.add_argument("--html", action="store_true",
@@ -584,6 +586,7 @@ def main():
     if args.html:
         Extractor.keepLinks = True
     Extractor.to_json = args.json
+    Extractor.preserve_unicode = args.preserve_unicode
 
     try:
         power = 'kmg'.find(args.bytes[-1].lower()) + 1

diff --git a/wikiextractor/extract.py b/wikiextractor/extract.py
@@ -982,7 +982,10 @@ def extract(self, out, html_safe=True):
                 'title': self.title,
                 'text': "\n".join(text)
             }
-            out_str = json.dumps(json_data)
+            if self.preserve_unicode:
+                out_str = json.dumps(json_data, ensure_ascii=False)
+            else:
+                out_str = json.dumps(json_data)
             out.write(out_str)
             out.write('\n')
         else: