#!/usr/bin/python3
"""
Remove remote assets from documentation files.

This tool strips remotely-hosted images, iframes, and tracking scripts from
documentation sources before they are turned into HTML and installed under
/usr/share/doc/.
"""

from __future__ import annotations

import argparse
import hashlib
import os
from pathlib import Path
import re
import shutil
import sys
import textwrap


# <img ... src="URL" ...>; group 1 is the URL.
HTML_IMG_RE = re.compile(r'<img\b[^>]*\bsrc=["\']([^"\']+)["\'][^>]*>', re.I)
# <iframe ... src="URL" ...> ... </iframe>; group 1 is the URL.
HTML_IFRAME_RE = re.compile(
    r'<iframe\b[^>]*\bsrc=["\']([^"\']+)["\'][^>]*>.*?</iframe>',
    re.I | re.S,
)
# A single <script> element whose body loads the Twitter widgets loader.
# The tempered "(?:(?!</script>).)*?" keeps the match inside ONE script
# element: with re.S a plain ".*?" would happily start at an earlier,
# unrelated <script> tag and swallow everything up to the widget snippet.
HTML_TWITTER_WIDGETS_SCRIPT_RE = re.compile(
    r"<script\b[^>]*>(?:(?!</script>).)*?"
    r"\+'\:\/\/platform\.twitter\.com\/widgets\.js'.*?</script>",
    re.I | re.S,
)
# A single <script> element whose body calls setTrackerUrl (Piwik-style
# tracker setup).  Tempered for the same reason as the pattern above.
HTML_PIWIK_SCRIPT_RE = re.compile(
    r"<script\b[^>]*>(?:(?!</script>).)*?setTrackerUrl.*?</script>",
    re.I | re.S,
)
# reStructuredText substitution definition, e.g.
#   .. |name| image:: URL
# group 1 is the substitution name, group 2 the image/figure target.
RST_SUB_DEF_RE = re.compile(r"^\s*\.\.\s+\|([^|]+)\|\s+(?:figure|image)::\s+(\S+)\s*$")
__version__ = "0.2"


class Options:
    """Plain container for the command-line settings used by the tool."""

    def __init__(
        self,
        excludes: list[str],
        no_act: bool,
        verbose: bool,
        sourcedir: Path | None,
    ) -> None:
        """Store the parsed settings as instance attributes, unmodified."""
        # Substrings; a file whose path contains any of them is skipped.
        self.excludes = excludes
        # Directory to work from, or None for the current directory.
        self.sourcedir = sourcedir
        # Dry-run flag: report only, never modify files.
        self.no_act = no_act
        # Whether progress messages are wanted.
        self.verbose = verbose


def error(message: str) -> None:
    """Print *message* to stderr with the tool prefix and exit with status 1.

    This function never returns normally: it always raises ``SystemExit(1)``.
    """
    print(f"dh_doc_privacy: error: {message}", file=sys.stderr)
    sys.exit(1)


def verbose_print(message: str, options: Options) -> None:
    """Write *message* to stderr when verbose or dry-run mode is active."""
    wanted = options.verbose or options.no_act
    if not wanted:
        return
    print(message, file=sys.stderr)


def parse_args(argv: list[str]) -> tuple[Options, list[str]]:
    """Parse the command line into an ``Options`` object and a path list.

    Args:
        argv: Argument strings, without the program name.

    Returns:
        A ``(options, paths)`` tuple.  ``paths`` is empty when no PATH was
        given on the command line.

    Raises:
        SystemExit: via ``error()`` when an unknown option is present, or
            from argparse itself for ``--help`` / ``--version`` / bad usage.
    """
    description = textwrap.dedent(
        """\
        Remove remote badge images, iframes, and tracking scripts from
        documentation files before they are shipped in a Debian package.

        By default the command scans the top-level README.md, README.rst, and
        README.html files. You can also pass explicit files or directories to
        process.
        """
    )
    parser = argparse.ArgumentParser(
        add_help=True,
        description=description,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        "paths",
        nargs="*",
        metavar="PATH",
        help="file or directory to process; defaults to top-level README files",
    )
    parser.add_argument(
        "-X",
        "--exclude",
        action="append",
        default=[],
        metavar="PATTERN",
        help="skip files whose path contains PATTERN; may be specified multiple times",
    )
    parser.add_argument(
        "--no-act",
        "-n",
        action="store_true",
        help="report files that would be changed without modifying them",
    )
    parser.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        help="print a message for each file that is examined or changed",
    )
    parser.add_argument(
        "--version", action="version", version=f"%(prog)s {__version__}"
    )
    parser.add_argument(
        "--sourcedir",
        metavar="DIR",
        help="treat DIR as the package source root instead of the current directory",
    )
    parsed, rest = parser.parse_known_args(argv)
    unknown = [arg for arg in rest if arg.startswith("-")]
    if unknown:
        error(f"unknown option(s): {' '.join(unknown)}")
    # parse_known_args() only fills "paths" from the first run of positional
    # arguments; positionals that appear after an option (``a -v b``) come
    # back in ``rest``.  Everything still in ``rest`` at this point is such a
    # positional, so append it instead of silently dropping the path.
    paths = [*parsed.paths, *rest]
    sourcedir = Path(parsed.sourcedir) if parsed.sourcedir else None
    options = Options(
        excludes=parsed.exclude,
        no_act=parsed.no_act,
        verbose=parsed.verbose,
        sourcedir=sourcedir,
    )
    return options, paths


def excludefile(path: str, options: Options) -> bool:
    """Return True when *path* contains any of the exclude substrings."""
    for pattern in options.excludes:
        if pattern in path:
            return True
    return False


def is_remote_url(url: str) -> bool:
    """Tell whether *url* points at a remote host.

    Surrounding whitespace and angle brackets are ignored.  A URL counts as
    remote when it starts with ``http://``, ``https://`` (any letter case) or
    a protocol-relative ``//``.
    """
    candidate = url.strip()
    candidate = candidate.strip("<>")
    found = re.match(r"^(https?:)?//", candidate, re.I)
    return found is not None


def remove_html_blocks(lines: list[str]) -> list[str]:
    """Replace HTML blocks that contain nothing but remote assets.

    A block starts on a line containing ``<table``, ``<p``, ``<div`` or
    ``<iframe`` (checked in that order) and runs up to and including the
    first line containing the matching closing tag, or to the end of input
    when no closing tag is found.  A block is removed when it references at
    least one remote ``<img>``/``<iframe>`` source, no local one, and no
    text remains once tags and character entities are stripped.

    Args:
        lines: The document split into lines (without newline characters).

    Returns:
        A new list of lines.  Every removed block is replaced by the single
        sentinel line ``"__BADGE_BLOCK_REMOVED__"`` (and one directly
        following blank line is dropped); callers are expected to filter the
        sentinel out of the final output.  All other lines are unchanged.
    """
    result: list[str] = []
    idx = 0
    total = len(lines)
    while idx < total:
        line = lines[idx]
        start_tag: str | None = None
        for tag in ("table", "p", "div", "iframe"):
            if f"<{tag}" in line:
                start_tag = tag
                break
        if start_tag is None:
            result.append(line)
            idx += 1
            continue
        end_tag = f"</{start_tag}>"
        block = [line]
        idx += 1
        # The element may open and close on the same line (for example
        # ``<p><img src="..."></p>``).  Only look at the text from the
        # opening tag onwards so an unrelated earlier closing tag does not
        # count.  Without this check a one-line block would swallow all
        # following lines up to the next closing tag and never be removed.
        closes_on_same_line = end_tag in line[line.index(f"<{start_tag}"):]
        if not closes_on_same_line:
            while idx < total and end_tag not in lines[idx]:
                block.append(lines[idx])
                idx += 1
            if idx < total:
                # Include the line that carries the closing tag.
                block.append(lines[idx])
                idx += 1
        block_text = "\n".join(block)
        has_remote = False
        has_local = False
        for pattern in (HTML_IMG_RE, HTML_IFRAME_RE):
            for match in pattern.finditer(block_text):
                if is_remote_url(match.group(1)):
                    has_remote = True
                else:
                    has_local = True
        if has_remote and not has_local:
            # Drop tags and character entities; if nothing is left the block
            # was pure markup around remote assets.
            stripped = re.sub(r"<[^>]+>|&[a-zA-Z0-9#]+;", "", block_text)
            if stripped.strip() == "":
                result.append("__BADGE_BLOCK_REMOVED__")
                # Also consume one blank separator line after the block.
                if idx < total and not lines[idx].strip():
                    idx += 1
                continue
        result.extend(block)
    return result


def remove_markdown_images(line: str) -> str:
    """Strip Markdown images with a remote source from *line*.

    Linked images (``[![alt](img)](link)``) are handled first, then plain
    images (``![alt](img)``).  In both cases only the image target decides
    whether the construct is removed; local images are left untouched.
    """

    def drop_if_remote(found: re.Match[str]) -> str:
        if is_remote_url(found.group(1)):
            return ""
        return found.group(0)

    patterns = (
        r"\[!\[[^\]]*\]\(([^)]+)\)\]\([^)]+\)",
        r"!\[[^\]]*\]\(([^)]+)\)",
    )
    for pattern in patterns:
        line = re.sub(pattern, drop_if_remote, line, flags=re.I | re.S)
    return line


def remove_html_images(line: str) -> str:
    """Strip ``<img>`` and ``<iframe>`` elements with a remote ``src``.

    Elements whose source is local are returned unchanged.
    """

    def drop_if_remote(found: re.Match[str]) -> str:
        if is_remote_url(found.group(1)):
            return ""
        return found.group(0)

    for pattern in (HTML_IMG_RE, HTML_IFRAME_RE):
        line = pattern.sub(drop_if_remote, line)
    return line


def remove_html_scripts_text(text: str) -> str:
    """Delete Twitter-widget and Piwik tracker ``<script>`` elements.

    The Twitter pattern is applied first, then the Piwik pattern.
    """
    without_twitter = HTML_TWITTER_WIDGETS_SCRIPT_RE.sub("", text)
    return HTML_PIWIK_SCRIPT_RE.sub("", without_twitter)


def remove_rst_badges(lines: list[str]) -> list[str]:
    """Remove remote image/figure directives from reStructuredText lines.

    Works in two passes.  The first pass drops substitution definitions
    (``.. |name| image:: URL``) and plain ``.. image::`` / ``.. figure::``
    directives whose target is a remote URL, together with their indented
    continuation lines.  The second pass removes ``|name|`` references to
    the dropped substitutions and tidies up lines that become empty.

    Args:
        lines: The document split into lines (without newline characters).

    Returns:
        A new list of lines; the input list is not modified.
    """
    # Used as a set: names of substitutions whose definition was dropped.
    removed_names: dict[str, bool] = {}
    result: list[str] = []
    idx = 0
    # Pass 1: drop remote image/figure directives.
    while idx < len(lines):
        line = lines[idx]
        match = RST_SUB_DEF_RE.match(line)
        if match and is_remote_url(match.group(2)):
            removed_names[match.group(1)] = True
            idx += 1
            # Skip the directive's indented option/content lines.
            while idx < len(lines) and lines[idx].startswith((" ", "\t")):
                idx += 1
            # Skip one blank separator line after the directive.
            if idx < len(lines) and not lines[idx].strip():
                idx += 1
            continue
        # Directive that is not a substitution definition.
        inline_match = re.match(r"^\s*\.\.\s+(?:figure|image)::\s+(\S+)\s*$", line)
        if inline_match and is_remote_url(inline_match.group(1)):
            idx += 1
            while idx < len(lines) and lines[idx].startswith((" ", "\t")):
                idx += 1
            if idx < len(lines) and not lines[idx].strip():
                idx += 1
            continue
        result.append(line)
        idx += 1

    # Pass 2: remove references to the dropped substitutions.
    final: list[str] = []
    # True right after a line was dropped in this pass.
    removed_prev = False
    for i, original in enumerate(result):
        line = original
        is_badge_block_line = re.match(r"^\s*\|", original) is not None
        neighbor_badge = False
        if is_badge_block_line:
            if i > 0 and re.match(r"^\s*\|", result[i - 1]):
                neighbor_badge = True
            if i + 1 < len(result) and re.match(r"^\s*\|", result[i + 1]):
                neighbor_badge = True
        # A line starting with "|" that has an adjacent line also starting
        # with "|" is copied through verbatim.
        # NOTE(review): presumably this protects line blocks / tables, but it
        # also leaves |name| references to removed substitutions in place on
        # such lines — confirm this is intended.
        if is_badge_block_line and neighbor_badge:
            final.append(original)
            continue
        for name in removed_names.keys():
            line = line.replace(f"|{name}|", "")
        # Drop a non-blank line when nothing but whitespace remains once
        # leading/trailing "|" characters are stripped from it.
        if line.strip() and line.strip("|").strip() == "":
            removed_prev = True
            continue
        # Drop a line that became blank because of the replacements above.
        if original.strip() and not line.strip():
            removed_prev = True
            continue
        # A blank line directly after a dropped line is kept only when the
        # last emitted line is non-blank, so no doubled or leading blank
        # line is left behind.
        if removed_prev and not line.strip():
            removed_prev = False
            if final and final[-1].strip():
                final.append(line)
            continue
        removed_prev = False
        final.append(line)
    return final


def remove_markdown_badges(text: str) -> str:
    """Strip remote assets from Markdown/HTML *text*.

    Tracking scripts are removed first, then whole HTML blocks made only of
    remote assets, then individual Markdown/HTML images per line.  A line
    that becomes blank through removal is dropped together with one blank
    line that directly follows it.  Leading and trailing blank lines of the
    result are trimmed; a trailing newline is kept if the (script-free) text
    ended with one.
    """
    text = remove_html_scripts_text(text)
    ends_with_newline = text.endswith("\n")

    kept: list[str] = []
    skip_next_blank = False
    for raw in remove_html_blocks(text.split("\n")):
        # Sentinel left behind by remove_html_blocks for a removed block.
        if raw == "__BADGE_BLOCK_REMOVED__":
            skip_next_blank = True
            continue
        stripped_line = remove_html_images(remove_markdown_images(raw))
        if raw.strip() and not stripped_line.strip():
            # The line held nothing but remote images.
            skip_next_blank = True
            continue
        if skip_next_blank:
            skip_next_blank = False
            if not stripped_line.strip():
                continue
        kept.append(stripped_line)

    # Trim blank lines at both ends without mutating the list repeatedly.
    first = 0
    last = len(kept)
    while first < last and not kept[first].strip():
        first += 1
    while last > first and not kept[last - 1].strip():
        last -= 1

    output = "\n".join(kept[first:last])
    if ends_with_newline:
        output += "\n"
    return output


def remove_rst_badges_text(text: str) -> str:
    """Strip remote assets from reStructuredText *text*.

    Applies, in order: tracking-script removal, removal of HTML blocks that
    contain only remote assets, removal of remote RST image/figure
    directives and their substitution references, and finally per-line
    removal of remote ``<img>``/``<iframe>`` elements.

    Returns:
        The cleaned text, joined with ``"\\n"``.
    """
    text = remove_html_scripts_text(text)
    # remove_html_blocks() marks each removed block with a sentinel line.
    # Drop it here; otherwise the literal marker would be written into the
    # cleaned document.
    lines = [
        line
        for line in remove_html_blocks(text.split("\n"))
        if line != "__BADGE_BLOCK_REMOVED__"
    ]
    lines = remove_rst_badges(lines)
    return "\n".join(remove_html_images(line) for line in lines)


def remove_badges(text: str, path: str) -> str:
    """Clean *text* according to the file type implied by *path*.

    Files ending in ``.rst`` (case-insensitive) only get the RST cleanup.
    Every other file gets the Markdown cleanup followed by the RST cleanup.
    """
    is_rst = path.lower().endswith(".rst")
    if not is_rst:
        text = remove_markdown_badges(text)
    return remove_rst_badges_text(text)


def restore_file_on_clean(path: Path, options: Options) -> None:
    """Save a pristine copy of *path* in the debhelper bucket.

    The file content is stored under ``debian/.debhelper/bucket/files/``
    named by its SHA-256 checksum, and a ``"<checksum> <path>"`` line is
    appended to ``debian/.debhelper/bucket/index``.  A path that is already
    listed in the index is not stored again, so the first (original)
    version is the one that is kept.
    NOTE(review): presumably the bucket is consumed by the package clean
    step to restore the original file — confirm against debhelper.

    Args:
        path: File to back up, relative to the current working directory.
        options: Run settings; nothing is done when ``options.no_act`` is set.

    Raises:
        SystemExit: via ``error()`` when *path* is absolute, is hidden or
            inside a hidden/CVS directory, or is not a regular file.
    """
    if options.no_act:
        return
    if path.is_absolute():
        error("restore_file_on_clean requires a path relative to the package dir")
    bucket_index = Path("debian/.debhelper/bucket/index")
    bucket_dir = Path("debian/.debhelper/bucket/files")
    bucket_dir.mkdir(parents=True, exist_ok=True)
    # Remove literal "./" prefixes only.  str.lstrip("./") would strip any
    # run of "." and "/" characters, turning "../README.md" into "README.md"
    # (a different file) and ".hidden/x" into "hidden/x", which defeats the
    # hidden-path check below.
    relative = str(path)
    while relative.startswith("./"):
        relative = relative[2:]
    while "//" in relative:
        relative = relative.replace("//", "/")
    normalized = Path(relative)
    normalized_str = str(normalized)
    if (
        normalized_str.startswith(".")
        or "/CVS/" in normalized_str
        or "/." in normalized_str
    ):
        error("Attempt to store a hidden or VCS file path")
    if normalized.is_symlink() or not normalized.is_file():
        error("Cannot store non-regular file for restore")
    checksum = hashlib.sha256(normalized.read_bytes()).hexdigest()
    stored_already = False
    if bucket_index.exists():
        for line in bucket_index.read_text(encoding="utf-8").splitlines():
            # Index lines have the form "<checksum> <path>".
            parts = line.split(" ", 1)
            if len(parts) == 2 and parts[1] == normalized_str:
                stored_already = True
                break
    if not stored_already:
        bucket_file = bucket_dir / checksum
        if not bucket_file.exists():
            # Copy to a temporary name first so a partially written file
            # never appears under the final checksum name.
            tmp_file = bucket_dir / f"{checksum}.tmp"
            shutil.copy2(normalized, tmp_file)
            os.replace(tmp_file, bucket_file)
        bucket_index.parent.mkdir(parents=True, exist_ok=True)
        with bucket_index.open("a", encoding="utf-8") as handle:
            handle.write(f"{checksum} {normalized_str}\n")


def clean_file(path: Path, options: Options) -> None:
    """Strip remote assets from one file, rewriting it only when needed.

    Excluded files and files that need no change are left alone.  In
    dry-run mode the file is only reported.  Otherwise the original is
    registered for restoration before the cleaned text is written back.

    Args:
        path: The documentation file to process.
        options: Run settings (excludes, dry-run and verbose flags).

    Raises:
        SystemExit: via ``error()`` when the file cannot be read, is not
            valid UTF-8, cannot be backed up, or cannot be written.
    """
    if excludefile(str(path), options):
        return
    try:
        text = path.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError) as exc:
        # UnicodeDecodeError is not an OSError; without catching it a
        # non-UTF-8 file would end the run with a bare traceback.
        error(f"Unable to read {path}: {exc}")
    cleaned = remove_badges(text, str(path))
    if cleaned == text:
        return
    if options.no_act:
        verbose_print(f"Would update {path}", options)
        return
    restore_file_on_clean(path, options)
    try:
        path.write_text(cleaned, encoding="utf-8")
    except OSError as exc:
        error(f"Unable to write {path}: {exc}")
    # Honour --verbose for real runs too, as its help text promises.
    verbose_print(f"Updated {path}", options)


def clean_dir(path: Path, options: Options) -> None:
    """Clean every ``.md``, ``.rst`` and ``.html`` file below *path*.

    Hidden files, hidden sub-directories and ``CVS`` directories are
    skipped: ``restore_file_on_clean`` refuses to back up such paths, so
    visiting them could only abort the whole run.  Entries are visited in
    sorted order so runs are reproducible.

    Args:
        path: Directory to walk recursively.
        options: Run settings passed through to ``clean_file``.
    """
    for root, dirs, files in os.walk(path):
        # Pruning ``dirs`` in place stops os.walk from descending into them.
        dirs[:] = sorted(
            name for name in dirs if not name.startswith(".") and name != "CVS"
        )
        for filename in sorted(files):
            if filename.startswith("."):
                continue
            if not filename.lower().endswith((".md", ".rst", ".html")):
                continue
            clean_file(Path(root) / filename, options)


def run(paths: list[str], options: Options) -> None:
    """Process the given paths, or the top-level README files if none.

    With an empty *paths* list, every regular file in the current directory
    named ``README.md``, ``README.rst`` or ``README.html`` (case-insensitive)
    is cleaned.  Otherwise each path is cleaned as a directory or a file;
    a missing path or a path that is neither ends the run via ``error()``.
    """
    if not paths:
        for candidate in Path(".").iterdir():
            if not candidate.is_file():
                continue
            if re.match(r"^README\.(md|rst|html)$", candidate.name, re.I):
                clean_file(candidate, options)
        return

    for path_str in paths:
        target = Path(path_str)
        if not target.exists():
            error(f"No such file: {path_str}")
        if target.is_dir():
            clean_dir(target, options)
        elif target.is_file():
            clean_file(target, options)
        else:
            error(f"{path_str} is not a file")


def main(argv: list[str] | None = None) -> int:
    """Entry point: parse arguments, switch directory, and clean the files.

    Args:
        argv: Argument strings without the program name.  ``None`` means
            "use ``sys.argv[1:]``"; an explicit empty list means "no
            arguments".

    Returns:
        0 on success.  Failures end the process via ``error()``.
    """
    # Test for None explicitly: ``argv or sys.argv[1:]`` would treat an
    # explicit empty list as "not given" and parse the process arguments.
    args = sys.argv[1:] if argv is None else argv
    options, paths = parse_args(args)
    if options.sourcedir:
        try:
            os.chdir(options.sourcedir)
        except OSError as exc:
            error(f"Unable to change to source directory {options.sourcedir}: {exc}")
    run(paths, options)
    return 0


# Run the tool when executed as a script; main()'s return value becomes the
# process exit status.
if __name__ == "__main__":
    sys.exit(main())
