diff --git a/.github/workflows/mobile-i18n-autofill-pr.yml b/.github/workflows/mobile-i18n-autofill-pr.yml new file mode 100644 index 00000000..74e11ce0 --- /dev/null +++ b/.github/workflows/mobile-i18n-autofill-pr.yml @@ -0,0 +1,100 @@ +name: Mobile i18n Autofill (bot PR) + +on: + workflow_dispatch: + inputs: + locales: + description: 'Comma-separated locale codes to translate' + required: false + default: 'ar,bn,de,en,es,fa,fr,gu,hi,hu,id,km,kn,ml,mr,ms,my,pl,pt,ru,si,sw,te,ur' + type: string + model: + description: 'Gemini model to use for translation' + required: false + default: 'gemma-3-27b-it' + type: string + batch-size: + description: 'Number of strings per translation batch' + required: false + default: '15' + type: string + repo-root: + description: 'Module root to scope translation (e.g. ./feature/settings)' + required: false + default: '.' + type: string + + pull_request: + types: [labeled] + branches: [dev] + +permissions: + contents: write + pull-requests: write + +jobs: + i18n-autofill: + if: >- + github.event_name == 'workflow_dispatch' || + github.event.label.name == 'needs-translation' + runs-on: ubuntu-latest + + env: + LOCALES: ${{ inputs.locales || 'ar,bn,de,en,es,fa,fr,gu,hi,hu,id,km,kn,ml,mr,ms,my,pl,pt,ru,si,sw,te,ur' }} + MODEL: ${{ inputs.model || 'gemma-3-27b-it' }} + BATCH_SIZE: ${{ inputs.batch-size || '15' }} + REPO_ROOT: ${{ inputs.repo-root || '.' 
}} + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 0 + repository: ${{ github.event.pull_request.head.repo.full_name || github.repository }} + ref: ${{ github.event.pull_request.head.ref || github.ref }} + token: ${{ secrets.GITHUB_TOKEN }} + + - name: Set up JDK 21 + uses: actions/setup-java@v4 + with: + java-version: '21' + distribution: 'temurin' + + - name: Set up Python 3.11 + uses: actions/setup-python@v5 + with: + python-version: '3.11' + + - name: Install Python dependencies + run: | + python -m pip install --upgrade pip + pip install google-genai lxml + + - name: Run translation autofill + env: + GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }} + run: | + python translate.py \ + --mode apply \ + --repo-root "$REPO_ROOT" \ + --locales "$LOCALES" \ + --model "$MODEL" \ + --batch-size $BATCH_SIZE + + - name: Validate Android resources compile + run: | + ./gradlew :cmp-android:processDemoDebugResources + + - name: Commit and Push changes + run: | + git config user.name "github-actions[bot]" + git config user.email "github-actions[bot]@users.noreply.github.com" + + git add cmp-android/ cmp-navigation/ feature/ + + if ! git diff --cached --quiet; then + git commit -m "chore: auto-generate mobile i18n translations" + git push origin HEAD:${{ github.event.pull_request.head.ref || github.ref_name }} + else + echo "No changes to commit." + fi diff --git a/docs/TRANSLATE.md b/docs/TRANSLATE.md new file mode 100644 index 00000000..c6ee58e0 --- /dev/null +++ b/docs/TRANSLATE.md @@ -0,0 +1,81 @@ +# Android String Resource Translator (`translate.py`) + +A production-ready Python script for translating Android string resources (`strings.xml` and `arrays.xml`) using the Google Gemini API. + +## Features + +- **Format Preservation**: Ensures comments, spacing (blank lines), and structure match the source file exactly. 
+- **Placeholder & Markup Safety**: Freezes placeholders (e.g., `%s`, `%1$d`) and markup tags (e.g., `<b>`, `<i>`) before translating to guarantee they are preserved and kept in the correct order. +- **Source Attribute Propagation**: Copies attributes like `formatted`, `product`, and `tools:*` to the translated strings. +- **Robust Error Handling**: Includes batch translation with individual string fallback on failure, and automatic retry mechanisms for rate limits (429) or model overloads (503). +- **Change Detection**: Tracks source strings through a simple hash-based snapshot mechanism (`.translation_snapshots/`). Only new strings and strings whose source text was modified are re-translated, saving time and tokens. +- **Advanced Resource Support**: Translates single `<string>`, ordered `<string-array>`, and `<plurals>` resources out-of-the-box. +- **Character Compatibility**: Manages HTML entity conversions and robust Android special character escaping. +- **AAPT2 Compatibility**: Implements proper `xliff` namespace handling to prevent build errors. + +## Requirements + +1. Python 3.8+ +2. Required packages: + ```bash + pip install google-genai lxml + ``` +3. A Google Gemini API Key + +## Usage + +Set your Google Gemini API key as an environment variable: +```bash +export GEMINI_API_KEY=your_api_key_here +``` +*(You can customize the environment variable name via the `--api-key-env` flag).* + +### Applying Translations + +Run the script in `apply` mode to fetch missing strings and write translated files directly to their respective `values-{locale}` folders. + +```bash +# Basic usage +python translate.py --mode apply --locales es,de,fr + +# Using a specific model and fine-tuning batch parameters +python translate.py \ + --mode apply \ + --repo-root . 
\ + --locales ar \ + --model gemma-3-27b-it \ + --batch-size 15 \ + --request-delay 4.0 +``` + +### Checking for Missing Translations + +Run the script in `check` mode inside CI/CD workflows to verify whether all strings are translated without making any actual API calls or file modifications. + +```bash +python translate.py --mode check --locales es,de,fr +``` +*In `check` mode, the script exits with code `2` if translations are missing.* + +## Available Command-Line Arguments + +- `--mode` (Required): `apply` (to translate and write XML) or `check` (to only check for missing keys). +- `--locales`: A comma-separated list of target Android language/region codes (e.g. `es,fr,de,ar`). Default is `es,de`. +- `--repo-root`: The path to the root of the Android project (where to search for `src/*/res/values/strings.xml` or the Compose Multiplatform equivalent). Default is `.`. +- `--model`: The Gemini API model to use. Default is `gemini-2.0-flash`. +- `--batch-size`: Number of strings to send in a single Gemini API request. Default is `20` (capped at `15` for Gemma models). +- `--request-delay`: Delay in seconds between API requests to prevent immediate rate-limiting. Default is `2.0` (forced to `4.0` for Gemma models). +- `--api-key-env`: Name of the environment variable used to retrieve the API key. Default is `GEMINI_API_KEY`. +- `--no-validate`: Disable automatic malformed-XML checks after writing translations. +- `--verbose` / `-v`: Enable debug-level logging. + +## Under The Hood + +### 1. Snapshot Tracking +When you successfully translate strings, the script saves a JSON file in `.translation_snapshots/` within the source module. Subsequent runs will compare current source text against these hashes, allowing `translate.py` to seamlessly fix previously translated strings if you tweak the original English wording. + +### 2. 
Orphaned Translations cleanup +In `apply` mode, if a developer deletes a string or an array item from the English source, the script reliably detects and strips the orphaned translation from all localized strings files to avoid accumulation of unused strings. + +### 3. Rate Limit Handling +If the Google Gemini backend responds with `429 Rate limited` or `503 Service Unavailable`, `translate.py` will automatically back off and retry according to `--max-retries` and the wait times embedded in API responses. diff --git a/translate.py b/translate.py new file mode 100644 index 00000000..57cbc5fc --- /dev/null +++ b/translate.py @@ -0,0 +1,2130 @@ +#!/usr/bin/env python3 +""" +Android String Resource Translator + +Production-ready translation of Android string resources using Google Gemini API. + +Features: +- Comment preservation (copies comments from source file exactly) +- Spacing preservation (maintains blank lines and structure from source) +- Placeholder preservation with validation (%s, %1$s, etc.) +- Markup tag preservation with validation (<b>, <i>, etc.) 
+- Token order validation (not just presence) +- Source attribute propagation (formatted, product, tools:*) +- Conditional placeholder handling for formatted="false" strings +- Whitespace preservation (no stripping of source text) +- HTML entity conversion (case-insensitive) +- Android special character escaping +- Proper xliff namespace handling for AAPT2 compatibility +- Batch translation with individual fallback +- Better 503/overload error handling +- Change detection via snapshot tracking (re-translates modified strings) +- Comprehensive validation and error handling + +Usage: + python translate.py --mode check --locales es,de,fr + python translate.py --mode apply --locales es,de,fr + python translate.py --mode apply --locales ar --model gemma-3-27b-it --batch-size 15 + +Environment: + GEMINI_API_KEY=your_api_key_here +""" + +from __future__ import annotations + +import argparse +import copy +import hashlib +import json +import logging +import os +import re +import sys +import time +from dataclasses import dataclass, field +from pathlib import Path +from typing import Dict, FrozenSet, List, Optional, Set, Tuple + +from lxml import etree as ET +from google import genai +from google.genai import types + +# ============================================================================ +# Logging Configuration +# ============================================================================ + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s [%(levelname)s] %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", +) +logger = logging.getLogger(__name__) + +# ============================================================================ +# Constants & Namespaces +# ============================================================================ + +XML_PARSER = ET.XMLParser( + remove_blank_text=False, + remove_comments=False, + strip_cdata=False, +) + +XLIFF_NAMESPACE = "urn:oasis:names:tc:xliff:document:1.2" +TOOLS_NAMESPACE = "http://schemas.android.com/tools" + 
+ET.register_namespace("xliff", XLIFF_NAMESPACE) +ET.register_namespace("tools", TOOLS_NAMESPACE) + +DEFAULT_EXCLUDE_DIRS: FrozenSet[str] = frozenset({ + ".git", ".gradle", "build", ".idea", "node_modules", + "__pycache__", "venv", ".venv", ".svn", ".hg", "target", + "bin", "obj", ".dart_tool", ".pub-cache", +}) + +PROPAGATE_ATTRIBUTES: FrozenSet[str] = frozenset({ + "formatted", + "product", +}) + +ALLOWED_TAGS: FrozenSet[str] = frozenset({ + "b", "i", "u", "s", "strike", "del", "ins", + "strong", "em", "cite", "dfn", "code", "samp", "kbd", "var", + "big", "small", "sup", "sub", "tt", + "a", "font", "annotation", "span", + "xliff:g", "g", +}) + +SNAPSHOT_DIR_NAME = ".translation_snapshots" + +# ============================================================================ +# Regex Patterns +# ============================================================================ + +PLACEHOLDER_PATTERNS = [ + r"%%", + r"%n", + r"\\n", + r"\\t", + r"%(?:\d+\$)?[-+# 0,(]*\d*(?:\.\d+)?[sdbBhHoOxXeEfgGaAcC]", + r"%(?:\d+\$)?[-+# 0,(]*\d*(?:\.\d+)?t[HIklMSLNpzZsQBbhAaCYyjmdeRTrDFc]", +] +PLACEHOLDER_RE = re.compile("|".join(PLACEHOLDER_PATTERNS)) + +XLIFF_TAG_RE = re.compile( + r"]*>.*?|]*/\s*>", + re.DOTALL | re.IGNORECASE, +) + +MARKUP_TAG_RE = re.compile(r"]*>") +MARKUP_PATTERN = re.compile(r"]*>") +TAG_NAME_PATTERN = re.compile(r" bool: + return self.attributes.get("formatted", "true").lower() != "false" + + def get_propagated_attributes(self) -> Dict[str, str]: + """Get attributes to propagate to translation (name, formatted, product, tools:*).""" + result = {"name": self.key} + for attr in PROPAGATE_ATTRIBUTES: + if attr in self.attributes: + result[attr] = self.attributes[attr] + for key, value in self.attributes.items(): + if key.startswith(f"{{{TOOLS_NAMESPACE}}}") or key.startswith("tools:"): + result[key] = value + return result + +@dataclass +class StringArrayEntry: + """A string-array resource entry with ordered items.""" + key: str + items: List[str] + attributes: 
Dict[str, str] = field(default_factory=dict) + + def flat_entries(self) -> List[StringEntry]: + """Flatten into individual StringEntry objects for translation.""" + return [ + StringEntry( + key=f"{self.key}__item_{i}", + text=text, + attributes=self.attributes, + ) + for i, text in enumerate(self.items) + if text and text.strip() + ] + + def content_for_hash(self) -> str: + """Combined content for snapshot hashing.""" + return "||".join(self.items) + + +@dataclass +class PluralsEntry: + """A plurals resource entry with quantity variants.""" + key: str + items: Dict[str, str] # quantity -> text + attributes: Dict[str, str] = field(default_factory=dict) + + def flat_entries(self) -> List[StringEntry]: + """Flatten into individual StringEntry objects for translation.""" + return [ + StringEntry( + key=f"{self.key}__plural_{quantity}", + text=text, + attributes=self.attributes, + ) + for quantity, text in self.items.items() + if text and text.strip() + ] + + def content_for_hash(self) -> str: + """Combined content for snapshot hashing.""" + return "||".join(f"{q}={t}" for q, t in sorted(self.items.items())) + + +@dataclass +class SourceResources: + """All translatable resources from a single source file.""" + strings: List[StringEntry] = field(default_factory=list) + string_arrays: List[StringArrayEntry] = field(default_factory=list) + plurals: List[PluralsEntry] = field(default_factory=list) + + @property + def total_count(self) -> int: + return ( + len(self.strings) + + sum(len(a.items) for a in self.string_arrays) + + sum(len(p.items) for p in self.plurals) + ) + + @property + def is_empty(self) -> bool: + return not self.strings and not self.string_arrays and not self.plurals + + def all_flat_entries(self) -> List[StringEntry]: + """All translatable items flattened into StringEntry list.""" + entries: List[StringEntry] = list(self.strings) + for arr in self.string_arrays: + entries.extend(arr.flat_entries()) + for plu in self.plurals: + 
entries.extend(plu.flat_entries()) + return entries + + def all_keys_for_snapshot(self) -> Dict[str, str]: + """Build key -> hash mapping for snapshot tracking.""" + data: Dict[str, str] = {} + for s in self.strings: + data[s.key] = content_hash(s.text) + for a in self.string_arrays: + data[f"__array__{a.key}"] = content_hash(a.content_for_hash()) + for p in self.plurals: + data[f"__plurals__{p.key}"] = content_hash(p.content_for_hash()) + return data + +@dataclass +class ExistingKeys: + """Track which keys already exist in a target file.""" + strings: Set[str] = field(default_factory=set) + string_arrays: Set[str] = field(default_factory=set) + plurals: Set[str] = field(default_factory=set) + + @property + def all_string_keys(self) -> Set[str]: + return self.strings + +@dataclass +class FrozenText: + """Text with placeholders and markup tags replaced by tokens.""" + original: str + frozen: str + placeholders: List[str] + tags: List[str] + + def unfreeze(self, translated_frozen: str) -> str: + result = translated_frozen + for i, ph in enumerate(self.placeholders): + result = result.replace(f"[[PH_{i}]]", ph) + for i, tag in enumerate(self.tags): + result = result.replace(f"[[TAG_{i}]]", tag) + return result + + def validate(self, translated_frozen: str) -> Tuple[bool, List[str]]: + errors: List[str] = [] + for i, ph in enumerate(self.placeholders): + token = f"[[PH_{i}]]" + if token not in translated_frozen: + errors.append(f"Missing placeholder {token} (was: {ph})") + for i, tag in enumerate(self.tags): + token = f"[[TAG_{i}]]" + if token not in translated_frozen: + tag_preview = tag[:40] + "..." 
if len(tag) > 40 else tag + errors.append(f"Missing tag {token} (was: {tag_preview})") + expected_tokens = TOKEN_SEQUENCE_RE.findall(self.frozen) + actual_tokens = TOKEN_SEQUENCE_RE.findall(translated_frozen) + if expected_tokens != actual_tokens and not errors: + errors.append(f"Token order changed: expected {expected_tokens}, got {actual_tokens}") + return len(errors) == 0, errors + + @property + def has_tokens(self) -> bool: + return bool(self.placeholders or self.tags) + + @property + def token_count(self) -> int: + return len(self.placeholders) + len(self.tags) + + +@dataclass +class LocaleResult: + """Translation results for a single locale and source file.""" + locale: str + source_path: Path + target_path: Path + total_source: int = 0 + already_translated: int = 0 + newly_translated: int = 0 + changed_count: int = 0 + failed: int = 0 + errors: List[str] = field(default_factory=list) + + @property + def missing_before(self) -> int: + return self.total_source - self.already_translated + self.changed_count + + +@dataclass +class ProcessingResult: + """Overall processing results across all files and locales.""" + locale_results: List[LocaleResult] = field(default_factory=list) + + @property + def total_missing_before(self) -> int: + return sum(r.missing_before for r in self.locale_results) + + @property + def total_translated(self) -> int: + return sum(r.newly_translated for r in self.locale_results) + + @property + def total_changed(self) -> int: + return sum(r.changed_count for r in self.locale_results) + + @property + def total_failed(self) -> int: + return sum(r.failed for r in self.locale_results) + + @property + def has_missing(self) -> bool: + return (self.total_missing_before - self.total_translated) > 0 + + @property + def has_failures(self) -> bool: + return self.total_failed > 0 + + +# ============================================================================ +# Snapshot Tracking Functions (Minimal Hash-Only) +# 
============================================================================ + + +def content_hash(text: str) -> str: + """Generate short hash of string content for change detection.""" + return hashlib.sha256(text.encode('utf-8')).hexdigest()[:12] + + +def get_snapshot_path(source_xml: Path, repo_root: Path) -> Path: + """Get snapshot file path at module level.""" + parts = source_xml.parts + + if "src" in parts: + src_index = parts.index("src") + module_root = Path(*parts[:src_index]) + relative_parts = parts[src_index:] + safe_name = "_".join(relative_parts) + return module_root / ".translation_snapshots" / f"{safe_name}.json" + + try: + relative = source_xml.relative_to(repo_root) + safe_name = str(relative).replace("/", "_").replace("\\", "_") + except ValueError: + safe_name = source_xml.name + return repo_root / ".translation_snapshots" / f"{safe_name}.json" + + +def load_snapshot(snapshot_path: Path) -> Dict[str, str]: + """Load snapshot: key -> hash mapping.""" + if not snapshot_path.exists(): + return {} + try: + content = snapshot_path.read_text(encoding='utf-8') + data = json.loads(content) + if isinstance(data, dict): + return {str(k): str(v) for k, v in data.items()} + return {} + except (json.JSONDecodeError, IOError, OSError) as e: + logger.warning(f"Failed to load snapshot {snapshot_path}: {e}") + return {} + + +def save_snapshot_full( + snapshot_path: Path, source_resources: SourceResources +) -> None: + """Save snapshot for all resource types.""" + try: + snapshot_path.parent.mkdir(parents=True, exist_ok=True) + data = source_resources.all_keys_for_snapshot() + snapshot_path.write_text( + json.dumps(data, sort_keys=True, separators=(",", ":")), + encoding="utf-8", + ) + except (IOError, OSError) as e: + logger.warning(f"Failed to save snapshot {snapshot_path}: {e}") + + +def find_changed_resources( + source_resources: SourceResources, + snapshot: Dict[str, str], + existing_keys: ExistingKeys, +) -> List[StringEntry]: + """ + Find ALL changed 
entries (strings, array items, plural items) + returned as flat StringEntry list for translation. + """ + changed: List[StringEntry] = [] + + # Regular strings + for entry in source_resources.strings: + if entry.key not in snapshot: + continue + if entry.key not in existing_keys.strings: + continue + if snapshot[entry.key] != content_hash(entry.text): + changed.append(entry) + + # String arrays + for arr in source_resources.string_arrays: + snap_key = f"__array__{arr.key}" + if snap_key not in snapshot: + continue + if arr.key not in existing_keys.string_arrays: + continue + if snapshot[snap_key] != content_hash(arr.content_for_hash()): + changed.extend(arr.flat_entries()) + + # Plurals + for plu in source_resources.plurals: + snap_key = f"__plurals__{plu.key}" + if snap_key not in snapshot: + continue + if plu.key not in existing_keys.plurals: + continue + if snapshot[snap_key] != content_hash(plu.content_for_hash()): + changed.extend(plu.flat_entries()) + + return changed + + +def _snapshot_needs_update_full( + snapshot: Dict[str, str], + source_resources: SourceResources, +) -> bool: + """Check if snapshot needs update based on ALL resource types.""" + if not snapshot: + return True + + current_data = source_resources.all_keys_for_snapshot() + + if set(current_data.keys()) != set(snapshot.keys()): + return True + + for key, current_hash in current_data.items(): + if snapshot.get(key) != current_hash: + return True + + return False + + +# ============================================================================ +# Text Freezing Functions +# ============================================================================ + + +def freeze_text(text: str, freeze_placeholders: bool = True) -> FrozenText: + """Replace placeholders and markup tags with tokens for safe translation.""" + frozen = text + placeholders: List[str] = [] + tags: List[str] = [] + + def freeze_xliff(match: re.Match) -> str: + tags.append(match.group(0)) + return f"[[TAG_{len(tags) - 1}]]" + + 
frozen = XLIFF_TAG_RE.sub(freeze_xliff, frozen) + + def freeze_tag(match: re.Match) -> str: + tags.append(match.group(0)) + return f"[[TAG_{len(tags) - 1}]]" + + frozen = MARKUP_TAG_RE.sub(freeze_tag, frozen) + + if freeze_placeholders: + def freeze_ph(match: re.Match) -> str: + placeholders.append(match.group(0)) + return f"[[PH_{len(placeholders) - 1}]]" + frozen = PLACEHOLDER_RE.sub(freeze_ph, frozen) + + return FrozenText(original=text, frozen=frozen, placeholders=placeholders, tags=tags) + + +# ============================================================================ +# Text Sanitization Functions +# ============================================================================ + + +def convert_html_entities_to_numeric(text: str) -> str: + """Convert HTML named entities to XML numeric entities.""" + def replace_entity(match: re.Match) -> str: + name = match.group(1).lower() + return HTML_ENTITY_TO_NUMERIC.get(name, match.group(0)) + return HTML_ENTITY_PATTERN.sub(replace_entity, text) + + +def fix_bare_ampersands(text: str) -> str: + """Replace bare ampersands with & for XML validity.""" + return BARE_AMPERSAND_PATTERN.sub("&", text) + + +def sanitize_for_xml_parse(text: str) -> str: + """Prepare text for XML parsing.""" + result = convert_html_entities_to_numeric(text) + return fix_bare_ampersands(result) + + +def escape_android_string(text: str) -> str: + """Escape Android special characters in string resources.""" + if not text: + return text + result: List[str] = [] + i = 0 + length = len(text) + while i < length: + char = text[i] + if char == '\\' and i + 1 < length: + next_char = text[i + 1] + if next_char in ("'", '"', '\\', 'n', 't', 'r', '@', '?'): + result.append(char) + result.append(next_char) + i += 2 + continue + if next_char == 'u' and i + 5 <= length: + hex_chars = text[i + 2:i + 6] + if len(hex_chars) == 4 and all(c in '0123456789abcdefABCDEF' for c in hex_chars): + result.append(text[i:i + 6]) + i += 6 + continue + if char == "'": + 
result.append("\\'") + elif char == '@' and i == 0: + result.append('\\@') + elif char == '?' and i == 0: + result.append('\\?') + else: + result.append(char) + i += 1 + return ''.join(result) + + +def escape_android_text_nodes(element: ET._Element) -> None: + """Recursively escape Android special characters in text and tail content.""" + if element.text: + element.text = escape_android_string(element.text) + for child in element: + if not callable(child.tag): + escape_android_text_nodes(child) + if child.tail: + child.tail = escape_android_string(child.tail) + + +def validate_allowed_tags(value: str) -> Tuple[bool, List[str]]: + """Check if all markup tags in value are in the allowlist.""" + if not MARKUP_PATTERN.search(value): + return True, [] + found = set(TAG_NAME_PATTERN.findall(value)) + unknown = [t for t in found if t.lower() not in ALLOWED_TAGS] + return len(unknown) == 0, unknown + + +# ============================================================================ +# XML Helper Functions +# ============================================================================ + + +def is_comment(elem) -> bool: + """Check if element is a comment (lxml comments have callable tag).""" + return callable(elem.tag) + + +def get_comment_text(elem) -> str: + """Get the text content of a comment element.""" + if is_comment(elem): + return elem.text or "" + return "" + + +def get_element_full_text(elem: ET._Element) -> str: + """Get full text content including child elements as markup.""" + parts: List[str] = [] + if elem.text: + parts.append(elem.text) + for child in elem: + if not is_comment(child): + parts.append(ET.tostring(child, encoding="unicode")) + if child.tail: + parts.append(child.tail) + return "".join(parts) + + +# ============================================================================ +# XML Reading Functions +# ============================================================================ + + +def read_source_resources(source_xml: Path) -> SourceResources: 
+ """Read ALL translatable resources from source XML.""" + tree = ET.parse(str(source_xml), parser=XML_PARSER) + root = tree.getroot() + resources = SourceResources() + + for node in root: + if is_comment(node): + continue + + # ── ────────────────────────────────────────── + if node.tag == "string": + name = node.get("name") + if not name: + continue + if node.get("translatable", "true").lower() == "false": + continue + raw_text = get_element_full_text(node) + if not raw_text or not raw_text.strip(): + continue + preserved = _extract_propagated_attrs(node) + resources.strings.append( + StringEntry(key=name, text=raw_text, attributes=preserved) + ) + + # ── ──────────────────────────────────── + elif node.tag == "string-array": + name = node.get("name") + if not name: + continue + if node.get("translatable", "true").lower() == "false": + continue + items: List[str] = [] + for item_node in node.iter("item"): + item_text = get_element_full_text(item_node) + items.append(item_text or "") + if not any(t.strip() for t in items): + continue + preserved = _extract_propagated_attrs(node) + resources.string_arrays.append( + StringArrayEntry(key=name, items=items, attributes=preserved) + ) + + # ── ───────────────────────────────────────── + elif node.tag == "plurals": + name = node.get("name") + if not name: + continue + if node.get("translatable", "true").lower() == "false": + continue + quantity_map: Dict[str, str] = {} + for item_node in node.iter("item"): + quantity = item_node.get("quantity") + if quantity: + item_text = get_element_full_text(item_node) + if item_text: + quantity_map[quantity] = item_text + if not quantity_map: + continue + preserved = _extract_propagated_attrs(node) + resources.plurals.append( + PluralsEntry(key=name, items=quantity_map, attributes=preserved) + ) + + return resources + + +def _extract_propagated_attrs(node: ET._Element) -> Dict[str, str]: + """Extract attributes to propagate from a source node.""" + preserved: Dict[str, str] = {} + 
for attr_key, attr_val in node.attrib.items(): + if attr_key in ("name", "translatable"): + continue + if attr_key in PROPAGATE_ATTRIBUTES: + preserved[attr_key] = attr_val + elif attr_key.startswith(f"{{{TOOLS_NAMESPACE}}}"): + preserved[attr_key] = attr_val + return preserved + + +def read_existing_keys_full(target_xml: Path) -> ExistingKeys: + """Read existing resource keys from target file (all types).""" + result = ExistingKeys() + if not target_xml.exists(): + return result + try: + tree = ET.parse(str(target_xml), parser=XML_PARSER) + root = tree.getroot() + result.strings = set(root.xpath("./string/@name")) + result.string_arrays = set(root.xpath("./string-array/@name")) + result.plurals = set(root.xpath("./plurals/@name")) + return result + except ET.XMLSyntaxError: + return result + +# ============================================================================ +# XML Writing Functions +# ============================================================================ + + +def set_mixed_string_value( + node: ET._Element, + value: str, + key: Optional[str] = None, + warn_unknown_tags: bool = True, +) -> None: + """Set string node value, preserving embedded markup.""" + node.text = None + for child in list(node): + node.remove(child) + + key_prefix = f"[{key}] " if key else "" + + if warn_unknown_tags and MARKUP_PATTERN.search(value): + is_valid, unknown = validate_allowed_tags(value) + if not is_valid: + logger.warning(f"{key_prefix}Unknown tags (may not render): {unknown}") + + if not MARKUP_PATTERN.search(value): + converted = convert_html_entities_to_numeric(value) + node.text = escape_android_string(converted) + return + + sanitized = sanitize_for_xml_parse(value) + wrapped = f"<_root xmlns:xliff='{XLIFF_NAMESPACE}'>{sanitized}" + + try: + fragment = ET.fromstring(wrapped.encode('utf-8')) + except ET.XMLSyntaxError as e: + logger.warning(f"{key_prefix}XML parse failed, using plain text: {e}") + fallback = convert_html_entities_to_numeric(value) + node.text 
= escape_android_string(fallback) + return + + node.text = fragment.text + for child in list(fragment): + fragment.remove(child) + node.append(child) + + escape_android_text_nodes(node) + + +def write_translations_full( + target_xml: Path, + translations: Dict[str, str], # flat key -> translated text + source_resources: SourceResources, + source_xml: Path, + validate: bool = True, + warn_unknown_tags: bool = True, +) -> int: + """ + Write translations including string-arrays and plurals. + + The `translations` dict uses flat keys: + - "key" -> string translation + - "key__item_0" -> string-array item + - "key__plural_one" -> plurals quantity variant + """ + target_xml.parent.mkdir(parents=True, exist_ok=True) + + source_tree = ET.parse(str(source_xml), parser=XML_PARSER) + source_root = source_tree.getroot() + + if target_xml.exists(): + try: + existing_tree = ET.parse(str(target_xml), parser=XML_PARSER) + existing_root = existing_tree.getroot() + return _merge_all_into_existing( + target_xml, existing_root, translations, + source_resources, source_root, validate, warn_unknown_tags + ) + except ET.XMLSyntaxError as e: + logger.warning(f"Corrupted '{target_xml}', recreating: {e}") + + return _create_from_source_full( + target_xml, translations, source_resources, + source_root, validate, warn_unknown_tags + ) + + +def _cleanup_orphaned_translations( + target_xml: Path, + source_resources: SourceResources, +) -> int: + """ + Remove entries from target file that no longer exist in source. + Returns count of removed entries. 
+ """ + if not target_xml.exists(): + return 0 + + try: + tree = ET.parse(str(target_xml), parser=XML_PARSER) + root = tree.getroot() + except ET.XMLSyntaxError: + return 0 + + source_string_keys: Set[str] = {e.key for e in source_resources.strings} + source_array_keys: Set[str] = {a.key for a in source_resources.string_arrays} + source_plural_keys: Set[str] = {p.key for p in source_resources.plurals} + + + elements_to_remove: List[ET._Element] = [] + removed_names: List[str] = [] + + for elem in list(root): + if is_comment(elem): + continue + + name = elem.get("name") + if not name: + continue + + if elem.tag == "string": + if name not in source_string_keys: + elements_to_remove.append(elem) + removed_names.append(f"string:{name}") + + elif elem.tag == "string-array": + if name not in source_array_keys: + elements_to_remove.append(elem) + removed_names.append(f"string-array:{name}") + + elif elem.tag == "plurals": + if name not in source_plural_keys: + elements_to_remove.append(elem) + removed_names.append(f"plurals:{name}") + + if not elements_to_remove: + return 0 + + for elem in elements_to_remove: + _remove_element_and_orphaned_comments(root, elem) + + _normalize_resource_whitespace(root) + + children = list(root) + if children: + for child in reversed(children): + if not is_comment(child): + if not child.tail or not child.tail.endswith("\n"): + child.tail = "\n" + break + + ET.cleanup_namespaces(root) + tree = ET.ElementTree(root) + tree.write( + str(target_xml), + encoding="utf-8", + xml_declaration=True, + pretty_print=False, + ) + _fix_xliff_namespaces_in_file(target_xml) + + for name in removed_names: + logger.info(f" ✕ Removed orphaned: {name}") + + return len(elements_to_remove) + +def _remove_element_and_orphaned_comments( + root: ET._Element, elem: ET._Element +) -> None: + """ + Remove element AND any preceding comments that would become orphaned. + + Example: if removing the last string under comment, + remove the comment too. 
+ """ + parent = elem.getparent() + if parent is None: + return + + prev = elem.getprevious() + + _remove_element_preserve_whitespace(root, elem) + + if prev is not None and is_comment(prev): + next_sibling = prev.getnext() + if next_sibling is None or is_comment(next_sibling): + _remove_element_preserve_whitespace(root, prev) + +def _create_from_source_full( + target_xml: Path, + translations: Dict[str, str], + source_resources: SourceResources, + source_root: ET._Element, + validate: bool, + warn_unknown_tags: bool, +) -> int: + """Create new file from source, filling in all resource types.""" + root = copy.deepcopy(source_root) + + # Build lookup sets + translated_string_keys: Set[str] = set() + translated_array_keys: Set[str] = set() + translated_plural_keys: Set[str] = set() + + for flat_key in translations: + if "__item_" in flat_key: + base_key = flat_key.rsplit("__item_", 1)[0] + translated_array_keys.add(base_key) + elif "__plural_" in flat_key: + base_key = flat_key.rsplit("__plural_", 1)[0] + translated_plural_keys.add(base_key) + else: + translated_string_keys.add(flat_key) + + elements_to_remove: List[ET._Element] = [] + written = 0 + + for elem in list(root): + if is_comment(elem): + continue + + name = elem.get("name") + if not name: + continue + + if elem.get("translatable", "true").lower() == "false": + continue + + if elem.tag == "string": + if name in translated_string_keys: + value = translations[name] + elem.text = None + for child in list(elem): + elem.remove(child) + set_mixed_string_value( + elem, value, key=name, + warn_unknown_tags=warn_unknown_tags, + ) + written += 1 + else: + elements_to_remove.append(elem) + + elif elem.tag == "string-array": + if name in translated_array_keys: + item_nodes = list(elem.iter("item")) + for i, item_node in enumerate(item_nodes): + flat_key = f"{name}__item_{i}" + if flat_key in translations: + value = translations[flat_key] + item_node.text = None + for child in list(item_node): + item_node.remove(child) 
                        set_mixed_string_value(
                            item_node, value, key=flat_key,
                            warn_unknown_tags=warn_unknown_tags,
                        )
                        written += 1
            else:
                elements_to_remove.append(elem)

        elif elem.tag == "plurals":
            if name in translated_plural_keys:
                # Plural variants are addressed by their quantity attribute:
                # "<name>__plural_<quantity>" (e.g. one, other).
                for item_node in elem.iter("item"):
                    quantity = item_node.get("quantity")
                    if quantity:
                        flat_key = f"{name}__plural_{quantity}"
                        if flat_key in translations:
                            value = translations[flat_key]
                            # Clear the copied source content before writing
                            # the translated value.
                            item_node.text = None
                            for child in list(item_node):
                                item_node.remove(child)
                            set_mixed_string_value(
                                item_node, value, key=flat_key,
                                warn_unknown_tags=warn_unknown_tags,
                            )
                            written += 1
            else:
                elements_to_remove.append(elem)

    # Drop every untranslated resource collected above.
    for elem in elements_to_remove:
        _remove_element_preserve_whitespace(root, elem)

    _normalize_resource_whitespace(root)

    ET.cleanup_namespaces(root)
    tree = ET.ElementTree(root)
    tree.write(str(target_xml), encoding="utf-8",
               xml_declaration=True, pretty_print=False)
    # Repair lxml's auto-generated namespace prefixes in the written file.
    _fix_xliff_namespaces_in_file(target_xml)

    if validate:
        # Round-trip parse to guarantee the emitted XML is well-formed.
        try:
            ET.parse(str(target_xml), parser=XML_PARSER)
        except ET.XMLSyntaxError as e:
            raise XmlWriteError(f"Written file is malformed: {target_xml}: {e}")

    return written


def _fix_xliff_namespaces_in_file(target_xml: Path) -> None:
    """
    Post-process the written XML file to fix xliff namespace issues and formatting.

    lxml may generate auto-prefixed namespaces (ns0, ns1, etc.) instead of
    using the proper 'xliff' prefix.
This function: + - Fixes XML declaration to use double quotes and lowercase encoding + - Adds copyright header if missing + - Replaces ns#: prefixes with xliff: for XLIFF namespace + - Removes inline xmlns:ns# declarations for XLIFF + - Ensures xliff namespace is declared at root level + """ + content = target_xml.read_text(encoding='utf-8') + original_content = content + + # Fix XML declaration: single quotes to double quotes, uppercase to lowercase + content = re.sub( + r"<\?xml version='1\.0' encoding='UTF-8'\?>", + '', + content + ) + + content = re.sub( + r'(-->)\s*()\s*(''' + + if '