diff --git a/README.md b/README.md index bd5407c..6239e18 100644 --- a/README.md +++ b/README.md @@ -56,8 +56,7 @@ Each file will contains several documents in this [document format](https://gith ``` usage: wikiextractor [-h] [-o OUTPUT] [-b n[KMG]] [-c] [--json] [--html] [-l] [-ns ns1,ns2] - [--templates TEMPLATES] [--no-templates] [--html-safe HTML_SAFE] [--processes PROCESSES] - [-q] [--debug] [-a] [-v] + [--preserve-unicode] [--templates TEMPLATES] [--no-templates] [--html-safe HTML_SAFE] [--processes PROCESSES] [-q] [--debug] [-a] [-v] input Wikipedia Extractor: @@ -93,6 +92,8 @@ Output: maximum bytes per output file (default 1M) -c, --compress compress output files using bzip --json write output in json format instead of the default format + --preserve-unicode + Do not convert unicode characters to ascii characters when using JSON output Processing: --html produce HTML output, subsumes --links diff --git a/wikiextractor/WikiExtractor.py b/wikiextractor/WikiExtractor.py index 830235d..31f8cdc 100755 --- a/wikiextractor/WikiExtractor.py +++ b/wikiextractor/WikiExtractor.py @@ -548,6 +548,8 @@ def main(): help="compress output files using bzip") groupO.add_argument("--json", action="store_true", help="write output in json format instead of the default format") + groupO.add_argument("--preserve-unicode", action="store_true", + help="Do not convert unicode characters to ascii characters when using JSON output") groupP = parser.add_argument_group('Processing') groupP.add_argument("--html", action="store_true", @@ -584,6 +586,7 @@ def main(): if args.html: Extractor.keepLinks = True Extractor.to_json = args.json + Extractor.preserve_unicode = args.preserve_unicode try: power = 'kmg'.find(args.bytes[-1].lower()) + 1 diff --git a/wikiextractor/extract.py b/wikiextractor/extract.py index a00e23d..641863d 100644 --- a/wikiextractor/extract.py +++ b/wikiextractor/extract.py @@ -982,7 +982,10 @@ def extract(self, out, html_safe=True): 'title': self.title, 'text': "\n".join(text) } - out_str = json.dumps(json_data) + if self.preserve_unicode: + out_str = json.dumps(json_data, ensure_ascii=False) + else: + out_str = json.dumps(json_data) out.write(out_str) out.write('\n') else: