From 4f7356f19af81a60a6c3beeccb6cfd34ee1c519c Mon Sep 17 00:00:00 2001 From: Shivaram Lingamneni Date: Thu, 9 Dec 2021 22:11:24 -0500 Subject: [PATCH] anope2json, atheme2json: handle non-UTF8 data Also ignore an unrecognized field type in anope --- distrib/anope/anope2json.py | 18 +++++++++++++++--- distrib/atheme/atheme2json.py | 14 +++++++++++--- 2 files changed, 26 insertions(+), 6 deletions(-) mode change 100644 => 100755 distrib/anope/anope2json.py diff --git a/distrib/anope/anope2json.py b/distrib/anope/anope2json.py old mode 100644 new mode 100755 index 80225e0f..02fb529b --- a/distrib/anope/anope2json.py +++ b/distrib/anope/anope2json.py @@ -46,8 +46,17 @@ def to_unixnano(timestamp): def file_to_objects(infile): result = [] obj = None - for line in infile: - pieces = line.rstrip('\r\n').split(' ', maxsplit=2) + while True: + line = infile.readline() + if not line: + break + line = line.rstrip(b'\r\n') + try: + line = line.decode('utf-8') + except UnicodeDecodeError: + line = line.decode('utf-8', 'replace') + logging.warning("line contained invalid utf8 data " + line) + pieces = line.split(' ', maxsplit=2) if len(pieces) == 0: logging.warning("skipping blank line in db") continue @@ -58,6 +67,9 @@ def file_to_objects(infile): obj = AnopeObject(pieces[1], {}) elif pieces[0] == 'DATA': obj.kv[pieces[1]] = pieces[2] + elif pieces[0] == 'ID': + # not sure what these do? + continue else: raise ValueError("unknown command found in anope db", pieces[0]) return result @@ -167,7 +179,7 @@ def convert(infile): def main(): if len(sys.argv) != 3: raise Exception("Usage: anope2json.py anope.db output.json") - with open(sys.argv[1]) as infile: + with open(sys.argv[1], 'rb') as infile: output = convert(infile) with open(sys.argv[2], 'w') as outfile: json.dump(output, outfile) diff --git a/distrib/atheme/atheme2json.py b/distrib/atheme/atheme2json.py index 2659b92d..23766e00 100755 --- a/distrib/atheme/atheme2json.py +++ b/distrib/atheme/atheme2json.py @@ -31,8 +31,16 @@ def convert(infile): channel_to_founder = defaultdict(lambda: (None, None)) - for line in infile: - line = line.rstrip('\r\n') + while True: + line = infile.readline() + if not line: + break + line = line.rstrip(b'\r\n') + try: + line = line.decode('utf-8') + except UnicodeDecodeError: + line = line.decode('utf-8', 'replace') + logging.warning("line contained invalid utf8 data " + line) parts = line.split(' ') category = parts[0] @@ -177,7 +185,7 @@ def convert(infile): def main(): if len(sys.argv) != 3: raise Exception("Usage: atheme2json.py atheme_db output.json") - with open(sys.argv[1]) as infile: + with open(sys.argv[1], 'rb') as infile: output = convert(infile) with open(sys.argv[2], 'w') as outfile: json.dump(output, outfile)