From 9ba061a5446ac449d04f139eedd38c900c467126 Mon Sep 17 00:00:00 2001
From: Bartosz
Date: Tue, 4 Apr 2023 09:17:18 +0200
Subject: [PATCH] [MOS-000] Reworked translation verification tool

Added removal of unused keys in files
Refactored code
---
 tools/verify_translations.py | 220 ++++++++++++++++++++++++++---------
 1 file changed, 163 insertions(+), 57 deletions(-)

diff --git a/tools/verify_translations.py b/tools/verify_translations.py
index 8f5966bfd5f1221f6fbe6dc8a331fdcac480671a..74b59555e50c52da0e9cee940e6d27e0f58ce87b 100755
--- a/tools/verify_translations.py
+++ b/tools/verify_translations.py
@@ -3,10 +3,11 @@
 # For licensing, see https://github.com/mudita/MuditaOS/LICENSE.md
 
 import collections
-import os
-import os.path as path
+import shutil
+from pathlib import Path
 import argparse
 import json
+import subprocess
 import sys
 import logging
 import textwrap
@@ -17,106 +18,209 @@ handler = logging.StreamHandler(sys.stdout)
 logger.addHandler(handler)
 
 
-def detect_duplicate_keys(list_of_pairs: list):
-    key_count = collections.Counter(k for k, v in list_of_pairs)
-    duplicate_keys = ', '.join(k for k, v in key_count.items() if v > 1)
+# note: ripgrep is required for this tool
+def detect_duplicate_keys(list_of_pairs):
+    key_count = collections.Counter(key for key, value in list_of_pairs)
+    duplicate_keys = [key for key, count in key_count.items() if count > 1]
     if duplicate_keys:
-        raise ValueError(duplicate_keys)
+        raise ValueError(", ".join(duplicate_keys))
+
+
+def copy_folder_contents(src_folder: Path, dst_folder: Path):
+    dst_folder.mkdir(parents=True, exist_ok=True)
+
+    for file_path in src_folder.glob("*"):
+        if file_path.is_file():
+            shutil.copy2(file_path, dst_folder / file_path.name)
+
+
+def write_all_keys_to_file(json_path: Path, output_path: Path):
+    with json_path.open() as json_file:
+        json_data = json.load(json_file)
+        keys = json_data.keys()
+
+    with output_path.open(mode='w') as output_file:
+        output_file.write('\n'.join(keys))
 
 
 def validate_data(list_of_pairs: list):
     detect_duplicate_keys(list_of_pairs)
-    # More detection, each of them will raise exception upon invalid
-    # data
     return dict(list_of_pairs)
 
 
-def perform_on_files_from_path(json_path: path, operation):
-    dir_list = os.listdir(json_path)
+def perform_on_files_from_path(json_path: Path, operation):
+    json_files = json_path.glob('*.json')
     ret = 0
-    for file in dir_list:
-        file_path = path.join(json_path, file)
-        with open(file_path) as json_file:
+    for file_path in json_files:
+        with file_path.open() as json_file:
             ret |= operation(file_path, json_file)
+
     return ret
 
 
-def check_duplicates(file_path: path, json_file):
+def check_duplicates(file_path: Path, json_file):
     try:
         _ = json.load(json_file, object_pairs_hook=validate_data)
     except ValueError as e:
-        dup_list = str(e).split(',')
-        logger.warning(f"[{path.basename(file_path)}]: duplicate {len(dup_list)}: {dup_list}")
+        duplicate_keys = [key.strip() for key in str(e).split(',') if key.strip()]
+        logger.debug(f"[{file_path.name}]: duplicate {len(duplicate_keys)}: {', '.join(duplicate_keys)}")
         return 1
     return 0
 
 
-def check_empty_entries(file_path: path, json_file):
+def check_empty_entries(file_path: Path, json_file):
     json_data = json.load(json_file)
-    empty_entries = [entry for entry in json_data if len(str(json_data[entry])) == 0]
+    empty_entries = [entry for entry, value in json_data.items() if not value]
     if empty_entries:
-        logger.warning(f"[{path.basename(file_path)}]: empty entries {len(empty_entries)}: {empty_entries}")
+        logger.debug(f"[{file_path.name}]: empty entries {len(empty_entries)}: {empty_entries}")
         return 1
     return 0
 
 
-def get_all_keys_from_path(json_path: path):
-    dir_list = os.listdir(json_path)
-    json_keys = []
-
-    # iterate to get all possible keys and check for key duplicates
-    for file in dir_list:
-        file_path = path.join(json_path, file)
+def get_all_keys_from_path(json_path: Path) -> set[str]:
+    json_keys = set()
 
-        with open(file_path) as json_file:
+    for file_path in json_path.glob('*.json'):
+        with file_path.open() as json_file:
             json_data = json.load(json_file)
-            json_keys.append(set(json_data))
+            json_keys |= set(json_data.keys())
 
-    return set.union(*json_keys)
+    return json_keys
 
 
-def check_missing_entries_from_path(json_path: path):
-    ret = 0
-    dir_list = os.listdir(json_path)
+def check_missing_entries_from_path(json_path: Path) -> int:
     all_keys = get_all_keys_from_path(json_path)
+    ret = 0
 
-    # iterate to find missing keys
-    for file in dir_list:
-        file_path = path.join(json_path, file)
-        with open(file_path) as json_file:
+    for file_path in json_path.glob('*.json'):
+        with file_path.open() as json_file:
             json_data = json.load(json_file)
-            missing_keys_in_file = all_keys.difference(set(json_data))
+            missing_keys_in_file = all_keys - set(json_data.keys())
+
             if missing_keys_in_file:
-                logger.warning(f"[{file}]: missing {len(missing_keys_in_file)}: {sorted(missing_keys_in_file)}")
-                ret |= 1
+                with (file_path.with_suffix('.pattern')).open('w') as pattern_file:
+                    pattern_file.write('\n'.join(missing_keys_in_file))
+                ret = 1
+
     return ret
 
 
-def fix_jsons(json_src_path: path, json_dst_path: path):
-    dir_list = os.listdir(json_src_path)
-    for file in dir_list:
-        src_file_path = path.join(json_src_path, file)
-        dst_file_path = path.join(json_dst_path, file)
-        if not path.exists(json_dst_path):
-            os.makedirs(json_dst_path)
+def fix_json(dst_path: Path):
+    with open(dst_path) as dst_file:
+        json_data = json.load(dst_file)
+
+    with open(dst_path, 'w') as dst_file:
+        json.dump(json_data, dst_file, indent=4, sort_keys=True)
+
+
+def fix_jsons(json_dst_path: Path):
+    if not json_dst_path.exists():
+        json_dst_path.mkdir(parents=True)
+
+    for file_path in json_dst_path.glob("*.json"):
+        dst_file_path = file_path
+
+        fix_json(dst_file_path)
+
+    logger.debug("Translation files fixed")
+
+
+def verify_keys_code_usage(pattern_src_path: Path, pattern_file=None):
+    unused_keys = []
+    used_keys = []
 
-        with open(src_file_path) as json_file, open(dst_file_path, 'w') as outfile:
+    if pattern_file is None:
+        file_list = list(pattern_src_path.glob("*.pattern"))
+    else:
+        pattern_file_path = pattern_src_path / pattern_file
+        if not pattern_file_path.exists():
+            raise ValueError(f"Pattern file {pattern_file_path} not found.")
+        file_list = [pattern_file_path]
+
+    for pattern_path in file_list:
+        with pattern_path.open("r") as file:
+            lines = [line.strip() for line in file if line.strip()]
+        rg_result = subprocess.run(
+            ["rg", "-f", str(pattern_path), "-g", f"!{pattern_src_path}", "-T", "json", ".."],
+            stdout=subprocess.PIPE,
+        ).stdout.decode("UTF-8")
+
+        for line in lines:
+            if line in rg_result:
+                used_keys.append(line)
+            else:
+                unused_keys.append(line)
+
+        pattern_path.unlink()
+
+    return set(unused_keys), set(used_keys)
+
+
+def remove_unused_keys(json_dst_path: Path, unused_keys: set):
+    if not json_dst_path.exists():
+        json_dst_path.mkdir(parents=True)
+
+    for file in json_dst_path.glob("*.json"):
+        with file.open() as json_file:
             json_data = json.load(json_file)
+            for key in unused_keys:
+                json_data.pop(key, None)
+
+        temp_path = file.with_suffix(".tmp")
+        with temp_path.open(mode='w') as outfile:
             json.dump(json_data, outfile, indent=4, sort_keys=True)
+        shutil.move(str(temp_path), str(file))
+
+    logger.debug("Translation files cleaned up from unused keys")
+
+
+def get_missing_and_used_keys_for_files(json_path: Path, used_keys: set):
+    ret = 0
+    dir_list = [x.name for x in json_path.glob("*.json")]
-    logger.info("Translation files fixed")
 
     # iterate to find missing keys
     for file in dir_list:
-        file_path = path.join(json_path, file)
+        file_path = json_path / file
         with file_path.open() as json_file:
             json_data = json.load(json_file)
-            missing_keys_in_file = all_keys.difference(set(json_data))
+            missing_keys_in_file = used_keys.difference(set(json_data))
+            if missing_keys_in_file:
+                logger.debug(
+                    f"[{file}]: missing and used {len(missing_keys_in_file)}: {sorted(missing_keys_in_file)}")
+                ret |= 1
+    return ret
 
 
 def main(args):
     ret = 0
+    src_path = Path(args.src)
+    dst_path = Path(args.dst) if args.dst else None
+
     if args.fix:
-        fix_jsons(args.src, args.dst)
+        copy_folder_contents(src_path, dst_path)
+        fix_jsons(dst_path)
+
+        # check for usage of English.json entries in the code
+        write_all_keys_to_file(dst_path / "English.json", dst_path / "English.keys")
+        not_used_keys, _ = verify_keys_code_usage(dst_path, "English.keys")
+        if not_used_keys:
+            logger.critical(f"unused english keys: {len(not_used_keys)}: {not_used_keys}")
+
+        remove_unused_keys(dst_path, not_used_keys)
+        missing_not_used_keys, missing_used_keys = verify_keys_code_usage(src_path)
+        ret |= get_missing_and_used_keys_for_files(src_path, missing_used_keys)
+        remove_unused_keys(dst_path, missing_not_used_keys)
+
+    ret |= perform_on_files_from_path(src_path, check_empty_entries)
+    ret |= perform_on_files_from_path(src_path, check_duplicates)
+    ret |= check_missing_entries_from_path(src_path)
+
+    for file in src_path.glob("*.pattern"):
+        file.unlink()
 
-    ret |= check_missing_entries_from_path(args.src)
-    ret |= perform_on_files_from_path(args.src, check_empty_entries)
-    ret |= perform_on_files_from_path(args.src, check_duplicates)
     return ret
@@ -124,14 +228,16 @@ if __name__ == "__main__":
     parser = argparse.ArgumentParser(
         prog='verify_translations',
         description='Script for checking the inconsistency of lang jsons',
-        formatter_class=argparse.RawTextHelpFormatter)
+        formatter_class=argparse.RawTextHelpFormatter
+    )
+
-    parser.add_argument('-s', '--src', metavar='path', help="source path to the json files", required=True)
+    parser.add_argument('-s', '--src', metavar='path', type=Path, help="source path to the json files", required=True)
     parser.add_argument('--fix', action='store_true', help=textwrap.dedent('''\
-        fix the translation files: remove duplicates and sort
+        fix the translation files: remove duplicates, remove unused keys and sort
         WARNING! this will overwrite your destination files!
-        
+
        Use with caution!'''))
-    parser.add_argument('-d', '--dst', metavar='path', help="destination path for the fixed json files")
+    parser.add_argument('-d', '--dst', metavar='path', type=Path, help="destination path for the fixed json files")
     parser.add_argument('-v', '--verbose', action='store_true')
 
     args = parser.parse_args()
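
Reviewer note, not part of the patch: the core of the rework is the ripgrep-based usage check in verify_keys_code_usage(), which writes candidate keys to a *.pattern file, runs rg -f <pattern file> over the repository (skipping JSON files via -T json), and treats a key as used if it appears anywhere in rg's output. Below is a minimal standalone sketch of that idea for local experimentation; it assumes ripgrep (rg) is on PATH, drops the -g exclusion glob by keeping the pattern file outside the search root, and uses illustrative key names and paths that are not taken from the repository.

    #!/usr/bin/env python3
    # Standalone sketch of the rg-based key-usage split; simplified relative to the patch.
    import subprocess
    import tempfile
    from pathlib import Path


    def split_used_unused(keys, search_root: Path):
        # Keep the pattern file outside the search root so rg cannot match it against itself.
        with tempfile.TemporaryDirectory() as tmp_dir:
            pattern_file = Path(tmp_dir) / "keys.pattern"
            pattern_file.write_text("\n".join(keys))  # one key per line, consumed by rg -f
            rg_output = subprocess.run(
                ["rg", "-f", str(pattern_file), "-T", "json", str(search_root)],
                stdout=subprocess.PIPE,
            ).stdout.decode("UTF-8")
        # Plain substring membership, mirroring the "line in rg_result" check in the patch.
        used = {key for key in keys if key in rg_output}
        return used, set(keys) - used


    if __name__ == "__main__":
        used, unused = split_used_unused(["sample_key_used", "sample_key_unused"], Path(".."))
        print(f"used: {sorted(used)}")
        print(f"unused: {sorted(unused)}")

With the patch applied, the tool itself would be invoked roughly as ./verify_translations.py --src <lang json dir> for checking only, or ./verify_translations.py --src <lang json dir> --fix --dst <output dir> to also sort the files and drop keys that ripgrep cannot find in the code; exact directories depend on the repository layout.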