~aleteoryx/muditaos

9ba061a5446ac449d04f139eedd38c900c467126 — Bartosz 2 years ago 6ed4069
[MOS-000] Reworked translation verification tool

Added removal of unused keys in files
Refactored code
1 file changed, 163 insertions(+), 57 deletions(-)
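
The headline change is the unused-key cleanup: the new verify_keys_code_usage writes the keys under scrutiny to a .pattern file, hands it to ripgrep via rg -f, and treats any key that never appears in the output as unused. A rough standalone sketch of that flow, assuming rg is installed and on PATH; the helper name, sample key and paths below are illustrative, not taken from the diff:

import subprocess
from pathlib import Path

def find_unused_keys(keys, search_root: Path, pattern_path: Path):
    # One key per line; rg -f treats every line of the file as a pattern.
    pattern_path.write_text("\n".join(keys))
    # -T json skips the translation files themselves, as the tool does.
    rg_output = subprocess.run(
        ["rg", "-f", str(pattern_path), "-T", "json", str(search_root)],
        stdout=subprocess.PIPE,
    ).stdout.decode("UTF-8")
    return {key for key in keys if key not in rg_output}

# Illustrative call: find_unused_keys(["app_phone_title"], Path(".."), Path("/tmp/keys.pattern"))

The match is plain substring containment against rg's output, the same shortcut the tool itself takes in verify_keys_code_usage below.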

M tools/verify_translations.py
M tools/verify_translations.py => tools/verify_translations.py +163 -57
@@ -3,10 +3,11 @@
# For licensing, see https://github.com/mudita/MuditaOS/LICENSE.md

import collections
-import os
-import os.path as path
+import shutil
+from pathlib import Path
import argparse
import json
+import subprocess
import sys
import logging
import textwrap


@@ -17,106 +18,209 @@ handler = logging.StreamHandler(sys.stdout)
logger.addHandler(handler)


-def detect_duplicate_keys(list_of_pairs: list):
-    key_count = collections.Counter(k for k, v in list_of_pairs)
-    duplicate_keys = ', '.join(k for k, v in key_count.items() if v > 1)
+# note: ripgrep is required for this tool
+def detect_duplicate_keys(list_of_pairs):
+    key_count = collections.Counter(key for key, value in list_of_pairs)
+    duplicate_keys = [key for key, count in key_count.items() if count > 1]

    if duplicate_keys:
-        raise ValueError(duplicate_keys)
+        raise ValueError(", ".join(duplicate_keys))


+def copy_folder_contents(src_folder: Path, dst_folder: Path):
+    dst_folder.mkdir(parents=True, exist_ok=True)

+    for file_path in src_folder.glob("*"):
+        if file_path.is_file():
+            shutil.copy2(file_path, dst_folder / file_path.name)


+def write_all_keys_to_file(json_path: Path, output_path: Path):
+    with json_path.open() as json_file:
+        json_data = json.load(json_file)
+        keys = json_data.keys()

+    with output_path.open(mode='w') as output_file:
+        output_file.write('\n'.join(keys))


def validate_data(list_of_pairs: list):
    detect_duplicate_keys(list_of_pairs)
    # More detection, each of them will raise exception upon invalid
    # data
    return dict(list_of_pairs)


-def perform_on_files_from_path(json_path: path, operation):
-    dir_list = os.listdir(json_path)
+def perform_on_files_from_path(json_path: Path, operation):
+    json_files = json_path.glob('*.json')
    ret = 0

-    for file in dir_list:
-        file_path = path.join(json_path, file)
-        with open(file_path) as json_file:
+    for file_path in json_files:
+        with file_path.open() as json_file:
            ret |= operation(file_path, json_file)

    return ret


-def check_duplicates(file_path: path, json_file):
+def check_duplicates(file_path: Path, json_file):
    try:
        _ = json.load(json_file, object_pairs_hook=validate_data)
    except ValueError as e:
-        dup_list = str(e).split(',')
-        logger.warning(f"[{path.basename(file_path)}]: duplicate {len(dup_list)}: {dup_list}")
+        duplicate_keys = [key.strip() for key in str(e).split(',') if key.strip()]
+        logger.debug(f"[{file_path.name}]: duplicate {len(duplicate_keys)}: {', '.join(duplicate_keys)}")
        return 1
    return 0


-def check_empty_entries(file_path: path, json_file):
+def check_empty_entries(file_path: Path, json_file):
    json_data = json.load(json_file)
-    empty_entries = [entry for entry in json_data if len(str(json_data[entry])) == 0]
+    empty_entries = [entry for entry, value in json_data.items() if not value]
    if empty_entries:
-        logger.warning(f"[{path.basename(file_path)}]: empty entries {len(empty_entries)}: {empty_entries}")
+        logger.debug(f"[{file_path.name}]: empty entries {len(empty_entries)}: {empty_entries}")
        return 1
    return 0


-def get_all_keys_from_path(json_path: path):
-    dir_list = os.listdir(json_path)
-    json_keys = []

-    # iterate to get all possible keys and check for key duplicates
-    for file in dir_list:
-        file_path = path.join(json_path, file)
+def get_all_keys_from_path(json_path: Path) -> set[str]:
+    json_keys = set()

-        with open(file_path) as json_file:
+    for file_path in json_path.glob('*.json'):
+        with file_path.open() as json_file:
            json_data = json.load(json_file)
-            json_keys.append(set(json_data))
+            json_keys |= set(json_data.keys())

-    return set.union(*json_keys)
+    return json_keys


-def check_missing_entries_from_path(json_path: path):
-    ret = 0
-    dir_list = os.listdir(json_path)
+def check_missing_entries_from_path(json_path: Path) -> int:
    all_keys = get_all_keys_from_path(json_path)
+    ret = 0

    # iterate to find missing keys
-    for file in dir_list:
-        file_path = path.join(json_path, file)
-        with open(file_path) as json_file:
+    for file_path in json_path.glob('*.json'):
+        with file_path.open() as json_file:
            json_data = json.load(json_file)
-            missing_keys_in_file = all_keys.difference(set(json_data))
+            missing_keys_in_file = all_keys - set(json_data.keys())

            if missing_keys_in_file:
-                logger.warning(f"[{file}]: missing {len(missing_keys_in_file)}: {sorted(missing_keys_in_file)}")
-                ret |= 1
+                with (file_path.with_suffix('.pattern')).open('w') as pattern_file:
+                    pattern_file.write('\n'.join(missing_keys_in_file))
+                ret = 1

    return ret


-def fix_jsons(json_src_path: path, json_dst_path: path):
-    dir_list = os.listdir(json_src_path)
-    for file in dir_list:
-        src_file_path = path.join(json_src_path, file)
-        dst_file_path = path.join(json_dst_path, file)
-        if not path.exists(json_dst_path):
-            os.makedirs(json_dst_path)
+def fix_json(dst_path: Path):
+    with open(dst_path) as dst_file:
+        json_data = json.load(dst_file)

+    with open(dst_path, 'w') as dst_file:
+        json.dump(json_data, dst_file, indent=4, sort_keys=True)


+def fix_jsons(json_dst_path: Path):
+    if not json_dst_path.exists():
+        json_dst_path.mkdir(parents=True)

+    for file_path in json_dst_path.glob("*.json"):
+        dst_file_path = file_path

+        fix_json(dst_file_path)

+    logger.debug("Translation files fixed")


+def verify_keys_code_usage(pattern_src_path: Path, pattern_file=None):
+    unused_keys = []
+    used_keys = []

-        with open(src_file_path) as json_file, open(dst_file_path, 'w') as outfile:
+    if pattern_file is None:
+        file_list = list(pattern_src_path.glob("*.pattern"))
+    else:
+        pattern_file_path = pattern_src_path / pattern_file
+        if not pattern_file_path.exists():
+            raise ValueError(f"Pattern file {pattern_file_path} not found.")
+        file_list = [pattern_file_path]

+    for pattern_path in file_list:
+        with pattern_path.open("r") as file:
+            lines = [line.strip() for line in file if line.strip()]
+            rg_result = subprocess.run(
+                ["rg", "-f", str(pattern_path), "-g", f"!{pattern_src_path}", "-T", "json", ".."],
+                stdout=subprocess.PIPE,
+            ).stdout.decode("UTF-8")

+            for line in lines:
+                if line in rg_result:
+                    used_keys.append(line)
+                else:
+                    unused_keys.append(line)

+        pattern_path.unlink()

+    return set(unused_keys), set(used_keys)


+def remove_unused_keys(json_dst_path: Path, unused_keys: set):
+    if not json_dst_path.exists():
+        json_dst_path.mkdir(parents=True)

+    for file in json_dst_path.glob("*.json"):
+        with file.open() as json_file:
            json_data = json.load(json_file)
+        for key in unused_keys:
+            json_data.pop(key, None)

+        temp_path = file.with_suffix(".tmp")
+        with temp_path.open(mode='w') as outfile:
            json.dump(json_data, outfile, indent=4, sort_keys=True)
+        shutil.move(str(temp_path), str(file))

+    logger.debug("Translation files cleaned up from unused keys")


+def get_missing_and_used_keys_for_files(json_path: Path, used_keys: set):
+    ret = 0
+    dir_list = [x.name for x in json_path.glob("*.json")]

-    logger.info("Translation files fixed")
+    # iterate to find missing keys
+    for file in dir_list:
+        file_path = json_path / file
+        with file_path.open() as json_file:
+            json_data = json.load(json_file)
+            missing_keys_in_file = used_keys.difference(set(json_data))

+            if missing_keys_in_file:
+                logger.debug(
+                    f"[{file}]: missing and used {len(missing_keys_in_file)}: {sorted(missing_keys_in_file)}")
+                ret |= 1
+    return ret


def main(args):
    ret = 0
+    src_path = Path(args.src)
+    dst_path = Path(args.dst) if args.dst else None

    if args.fix:
-        fix_jsons(args.src, args.dst)
+        copy_folder_contents(src_path, dst_path)
+        fix_jsons(dst_path)

+        # check for usage of English.json entries in the code
+        write_all_keys_to_file(dst_path / "English.json", dst_path / "English.keys")
+        not_used_keys, _ = verify_keys_code_usage(dst_path, "English.keys")
+        if not_used_keys:
+            logger.critical(f"unused english keys:  {len(not_used_keys)}: {not_used_keys}")

+        remove_unused_keys(dst_path, not_used_keys)
+        missing_not_used_keys, missing_used_keys = verify_keys_code_usage(src_path)
+        ret |= get_missing_and_used_keys_for_files(src_path, missing_used_keys)
+        remove_unused_keys(dst_path, missing_not_used_keys)

+    ret |= perform_on_files_from_path(src_path, check_empty_entries)
+    ret |= perform_on_files_from_path(src_path, check_duplicates)
+    ret |= check_missing_entries_from_path(src_path)

+    for file in src_path.glob("*.pattern"):
+        file.unlink()

-    ret |= check_missing_entries_from_path(args.src)
-    ret |= perform_on_files_from_path(args.src, check_empty_entries)
-    ret |= perform_on_files_from_path(args.src, check_duplicates)
    return ret




@@ -124,14 +228,16 @@ if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        prog='verify_translations',
        description='Script for checking the inconsistency of lang jsons',
-        formatter_class=argparse.RawTextHelpFormatter)
-    parser.add_argument('-s', '--src', metavar='path', help="source path to the json files", required=True)
+        formatter_class=argparse.RawTextHelpFormatter
+    )
+
+    parser.add_argument('-s', '--src', metavar='path', type=Path, help="source path to the json files", required=True)
    parser.add_argument('--fix', action='store_true', help=textwrap.dedent('''\
-        fix the translation files: remove duplicates and sort
+        fix the translation files: remove duplicates, remove unused keys and sort
        WARNING! this will overwrite your destination files!
        
    
        Use with caution!'''))
-    parser.add_argument('-d', '--dst', metavar='path', help="destination path for the fixed json files")
+    parser.add_argument('-d', '--dst', metavar='path', type=Path, help="destination path for the fixed json files")
    parser.add_argument('-v', '--verbose', action='store_true')

    args = parser.parse_args()
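
With the new arguments, a typical fix run would look something like python tools/verify_translations.py -s <source dir> --fix -d <destination dir>, while the plain consistency checks only need -s. The duplicate check itself rides on json.load's object_pairs_hook: the hook sees every key/value pair before the parser silently collapses repeated keys, so check_duplicates can count them. A minimal sketch of that idea; the function name and sample JSON are made up for illustration:

import collections
import json

def reject_duplicate_keys(pairs):
    # Called by the JSON parser with every (key, value) pair of an object.
    counts = collections.Counter(key for key, _ in pairs)
    duplicates = [key for key, count in counts.items() if count > 1]
    if duplicates:
        raise ValueError(", ".join(duplicates))
    return dict(pairs)

sample = '{"app_name": "Phone", "app_name": "Telefon"}'  # made-up translation snippet
try:
    json.loads(sample, object_pairs_hook=reject_duplicate_keys)
except ValueError as error:
    print(f"duplicate keys: {error}")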