init mdlink

This commit is contained in:
2026-04-10 15:24:42 +02:00
commit f4fa4fc35e
12 changed files with 697 additions and 0 deletions

43
.gitignore vendored Normal file
View File

@@ -0,0 +1,43 @@
# Python bytecode and caches
__pycache__/
*.py[cod]
*$py.class
# Virtual environments
.venv/
venv/
env/
ENV/
# Build and packaging artifacts
build/
dist/
*.egg-info/
.eggs/
pip-wheel-metadata/
*.whl
# Test and coverage outputs
.pytest_cache/
.mypy_cache/
.ruff_cache/
.coverage
.coverage.*
htmlcov/
.tox/
.nox/
# Local tooling/runtime files
.DS_Store
Thumbs.db
*.log
.codex
# IDE/editor settings
.idea/
.vscode/
*.swp
*.swo
# Local test artifacts
test_preview.md

52
README.md Normal file
View File

@@ -0,0 +1,52 @@
# mdlink
`mdlink` is a CLI tool to recursively scan Markdown files and validate HTTP/HTTPS links.
## Features
- Scans `.md` files recursively in a directory (or a single `.md` file).
- Extracts Markdown links (`[text](url)`) and naked URLs (`https://...`).
- Ignores image links (`![alt](url)`).
- Checks links with `httpx` and follows redirects automatically.
- Ignores `200 OK` in the report; shows non-200 and request errors.
- Shows final resolved URL in output.
- Asks interactively before rewriting redirected Markdown links.
- Re-checks target URL before writing (must still return `200`).
- Rewrites links via Markdown AST editing (`markdown-it-py`), not string replacement.
## Installation
```bash
python3 -m venv .venv
source .venv/bin/activate
pip install .
```
## Usage
```bash
mdlink /path/to/docs
```
## Options
- `--timeout FLOAT`
Per-request timeout in seconds (default: `10.0`).
## Interactive Redirect Rewrite
When a Markdown link redirects, `mdlink` prompts:
```text
Replace old URL with final URL? [y/N]
```
For example, a failing link such as `[broken](https://httpbin.org/status/404)` is only reported — it is never rewritten.
Only confirmed links are updated.
## Help
```bash
mdlink --help
```

5
mdlink/__init__.py Normal file
View File

@@ -0,0 +1,5 @@
"""mdlink package."""
from .cli import main
__all__ = ["main"]

217
mdlink/ast_editor.py Normal file
View File

@@ -0,0 +1,217 @@
from __future__ import annotations
from collections import defaultdict
from dataclasses import dataclass
from typing import DefaultDict
from markdown_it import MarkdownIt
from markdown_it.token import Token
@dataclass(frozen=True)
class _LinePatch:
    """One URL substitution (old -> new) to apply within a single source line."""

    old: str  # URL currently present in the Markdown source
    new: str  # replacement URL to splice in
class ASTMarkdownEditor:
    """Rewrites Markdown inline-link destinations in place.

    markdown-it is used only to locate which source lines contain links
    that need replacing; the actual substitution is performed by a small
    character-level scanner over each affected line so that surrounding
    text, link titles, and inline code spans are left untouched.
    """

    def __init__(self) -> None:
        # CommonMark-only parser; used solely to locate links, never to render.
        self._md = MarkdownIt("commonmark")

    def replace_links(self, content: str, replacements: dict[str, str]) -> str:
        """Return *content* with each old-URL -> new-URL mapping applied.

        Only inline Markdown links (``[text](url)``) are rewritten; the
        input is returned unchanged when there is nothing to replace.
        """
        if not replacements:
            return content
        tokens = self._md.parse(content)
        patches_by_line = self._collect_line_patches(tokens=tokens, replacements=replacements)
        if not patches_by_line:
            return content
        # keepends=True so rejoining preserves the original line endings.
        lines = content.splitlines(keepends=True)
        for line_index, patches in patches_by_line.items():
            if line_index < 0 or line_index >= len(lines):
                continue
            lines[line_index] = self._rewrite_markdown_links_in_line(lines[line_index], patches)
        return "".join(lines)

    def _collect_line_patches(
        self,
        tokens: list[Token],
        replacements: dict[str, str],
    ) -> dict[int, list[_LinePatch]]:
        """Map 0-based source line index -> ordered patches for that line.

        NOTE(review): ``token.map[0]`` is the first source line of the whole
        inline token, so a link on a continuation line of a multi-line
        paragraph is attributed to the paragraph's first line and will not
        be matched by the line rewriter — confirm this is acceptable.
        """
        patches_by_line: DefaultDict[int, list[_LinePatch]] = defaultdict(list)
        for token in tokens:
            if token.type != "inline" or not token.children:
                continue
            if not token.map:
                continue
            line_index = int(token.map[0])
            for child in token.children:
                if child.type != "link_open":
                    continue
                href = child.attrGet("href")
                if not href:
                    continue
                new_href = replacements.get(href)
                # Skip links with no mapping, or a no-op mapping.
                if not new_href or new_href == href:
                    continue
                patches_by_line[line_index].append(_LinePatch(old=href, new=new_href))
        return dict(patches_by_line)

    def _rewrite_markdown_links_in_line(self, line: str, patches: list[_LinePatch]) -> str:
        """Apply *patches* (in order) to the Markdown links found in *line*.

        Scans character by character, skipping inline code spans and image
        links, consuming one patch per successfully matched link.
        """
        if not patches:
            return line
        patch_index = 0
        chars = list(line)
        i = 0
        in_code = False
        code_ticks = 0  # backtick-run length that opened the current code span
        while i < len(chars) and patch_index < len(patches):
            char = chars[i]
            if char == "`" and not self._is_escaped(chars, i):
                # Toggle code-span state; a span closes only on a run of the
                # same length as the one that opened it (CommonMark rule).
                run = self._count_char_run(chars, i, "`")
                if not in_code:
                    in_code = True
                    code_ticks = run
                elif run == code_ticks:
                    in_code = False
                    code_ticks = 0
                i += run
                continue
            if in_code:
                i += 1
                continue
            if char == "[" and not self._is_escaped(chars, i):
                if i > 0 and chars[i - 1] == "!":
                    # Image link (![alt](url)) — never rewritten.
                    i += 1
                    continue
                parsed = self._parse_inline_link(chars, i)
                if parsed is None:
                    i += 1
                    continue
                start_url, end_url, parsed_url, close_index = parsed
                patch = patches[patch_index]
                if parsed_url == patch.old:
                    # Splice in the replacement, then shift the closing paren
                    # index by the length delta before resuming the scan.
                    replacement = list(patch.new)
                    chars[start_url:end_url] = replacement
                    delta = len(replacement) - (end_url - start_url)
                    close_index += delta
                    patch_index += 1
                    i = close_index + 1
                    continue
            i += 1
        return "".join(chars)

    def _parse_inline_link(self, chars: list[str], open_bracket: int) -> tuple[int, int, str, int] | None:
        """Parse ``[text](dest ...)`` starting at *open_bracket*.

        Returns ``(url_start, url_end, url, close_paren_index)``, or None
        when the text at this position is not a well-formed inline link.
        """
        close_bracket = self._find_link_text_end(chars, open_bracket)
        if close_bracket is None:
            return None
        cursor = close_bracket + 1
        # Allow whitespace between ']' and '('.
        while cursor < len(chars) and chars[cursor] in (" ", "\t"):
            cursor += 1
        if cursor >= len(chars) or chars[cursor] != "(":
            return None
        close_paren = self._find_matching_paren(chars, cursor)
        if close_paren is None:
            return None
        dest_start = cursor + 1
        # Skip leading whitespace inside the parentheses.
        while dest_start < close_paren and chars[dest_start] in (" ", "\t"):
            dest_start += 1
        if dest_start >= close_paren:
            return None
        if chars[dest_start] == "<":
            # Angle-bracketed destination: <url>.
            dest_end = dest_start + 1
            while dest_end < close_paren and chars[dest_end] != ">":
                dest_end += 1
            if dest_end >= close_paren:
                return None
            url_start = dest_start + 1
            url_end = dest_end
        else:
            url_start = dest_start
            url_end = self._scan_destination_end(chars, start=dest_start, stop=close_paren)
        if url_end <= url_start:
            return None
        parsed_url = "".join(chars[url_start:url_end])
        return url_start, url_end, parsed_url, close_paren

    def _find_link_text_end(self, chars: list[str], open_bracket: int) -> int | None:
        """Return the index of the ']' matching *open_bracket*, honoring
        nested brackets and backslash escapes; None when unclosed."""
        depth = 1
        index = open_bracket + 1
        while index < len(chars):
            char = chars[index]
            if char == "[" and not self._is_escaped(chars, index):
                depth += 1
            elif char == "]" and not self._is_escaped(chars, index):
                depth -= 1
                if depth == 0:
                    return index
            index += 1
        return None

    def _find_matching_paren(self, chars: list[str], open_paren: int) -> int | None:
        """Return the index of the ')' matching *open_paren*.

        Quoted spans (link titles) are skipped so parentheses inside a
        title cannot unbalance the match; None when unclosed.
        """
        depth = 1
        index = open_paren + 1
        in_quote: str | None = None
        while index < len(chars):
            char = chars[index]
            if in_quote is not None:
                if char == in_quote and not self._is_escaped(chars, index):
                    in_quote = None
                index += 1
                continue
            if char in ('"', "'") and not self._is_escaped(chars, index):
                in_quote = char
                index += 1
                continue
            if char == "(" and not self._is_escaped(chars, index):
                depth += 1
            elif char == ")" and not self._is_escaped(chars, index):
                depth -= 1
                if depth == 0:
                    return index
            index += 1
        return None

    def _scan_destination_end(self, chars: list[str], start: int, stop: int) -> int:
        """Return the end index of a bare link destination: stops at top-level
        whitespace or an unbalanced ')', while allowing balanced parentheses
        inside the URL (e.g. Wikipedia-style links)."""
        depth = 0
        index = start
        while index < stop:
            char = chars[index]
            if char in (" ", "\t"):
                if depth == 0:
                    break
            elif char == "(" and not self._is_escaped(chars, index):
                depth += 1
            elif char == ")" and not self._is_escaped(chars, index):
                if depth == 0:
                    break
                depth -= 1
            index += 1
        return index

    def _is_escaped(self, chars: list[str], pos: int) -> bool:
        """True when the character at *pos* is preceded by an odd number of
        backslashes (i.e. backslash-escaped)."""
        backslashes = 0
        index = pos - 1
        while index >= 0 and chars[index] == "\\":
            backslashes += 1
            index -= 1
        return (backslashes % 2) == 1

    def _count_char_run(self, chars: list[str], start: int, char: str) -> int:
        """Length of the run of *char* beginning at *start*."""
        end = start
        while end < len(chars) and chars[end] == char:
            end += 1
        return end - start

54
mdlink/checker.py Normal file
View File

@@ -0,0 +1,54 @@
from __future__ import annotations
from typing import Iterable
import httpx
from .models import LinkCheckResult
from .utils import unique_preserve_order
class LinkChecker:
    """HTTP link checker backed by a shared ``httpx`` client.

    Redirects are followed automatically and the final resolved URL is
    recorded on every result. Usable as a context manager.
    """

    def __init__(self, timeout: float = 10.0) -> None:
        self._timeout = timeout
        self._client = httpx.Client(
            follow_redirects=True,
            timeout=httpx.Timeout(timeout),
            headers={"User-Agent": "mdlink/0.1"},
        )

    def close(self) -> None:
        """Release the underlying HTTP client."""
        self._client.close()

    def __enter__(self) -> "LinkChecker":
        return self

    def __exit__(self, exc_type, exc, tb) -> None:
        self.close()

    def check(self, url: str) -> LinkCheckResult:
        """Fetch *url* and summarize the outcome as a LinkCheckResult.

        Transport-level failures are captured in ``error`` instead of
        propagating.
        """
        try:
            response = self._client.get(url)
        except httpx.HTTPError as exc:
            return LinkCheckResult(
                original_url=url,
                status_code=None,
                final_url=None,
                redirected=False,
                error=str(exc),
            )
        resolved = str(response.url)
        return LinkCheckResult(
            original_url=url,
            status_code=response.status_code,
            final_url=resolved,
            redirected=resolved != url,
            error=None,
        )

    def check_many(self, urls: Iterable[str]) -> dict[str, LinkCheckResult]:
        """Check each distinct URL once, preserving first-seen order."""
        return {url: self.check(url) for url in unique_preserve_order(urls)}

127
mdlink/cli.py Normal file
View File

@@ -0,0 +1,127 @@
from __future__ import annotations
import argparse
from collections import defaultdict
from pathlib import Path
from rich.console import Console
from rich.table import Table
from .ast_editor import ASTMarkdownEditor
from .checker import LinkChecker
from .models import LinkCheckResult, LinkRecord
from .scanner import MarkdownScanner
from .utils import unique_preserve_order
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse command-line arguments.

    Args:
        argv: Argument list to parse. ``None`` (the default) falls back to
            ``sys.argv[1:]``, preserving the original call signature while
            making the parser testable.

    Returns:
        Namespace with ``path`` (Path) and ``timeout`` (float) attributes.
    """
    parser = argparse.ArgumentParser(prog="mdlink", description="Scan Markdown files and validate links.")
    parser.add_argument("path", type=Path, help="Directory or Markdown file to scan")
    parser.add_argument("--timeout", type=float, default=10.0, help="Request timeout in seconds")
    return parser.parse_args(argv)
def _build_report_table(records: list[LinkRecord], checks: dict[str, LinkCheckResult]) -> Table:
    """Render every reportable (non-200 or errored) link check as a rich Table."""
    report = Table(title="Non-200 Links")
    report.add_column("file")
    report.add_column("line", justify="right")
    report.add_column("original URL")
    report.add_column("status")
    report.add_column("final URL")
    for record in records:
        outcome = checks[record.url]
        if not outcome.should_report:
            continue
        # Show the HTTP status when we got one, otherwise the transport error.
        if outcome.status_code is None:
            status = f"ERR: {outcome.error}"
        else:
            status = str(outcome.status_code)
        report.add_row(
            str(record.file_path),
            str(record.line),
            record.url,
            status,
            outcome.final_url or "",
        )
    return report
def _collect_redirects(records: list[LinkRecord], checks: dict[str, LinkCheckResult]) -> list[tuple[LinkRecord, LinkCheckResult]]:
redirects: list[tuple[LinkRecord, LinkCheckResult]] = []
for record in records:
result = checks[record.url]
if not result.redirected:
continue
if not result.final_url:
continue
redirects.append((record, result))
return redirects
def _handle_rewrites(
    redirects: list[tuple[LinkRecord, LinkCheckResult]],
    checker: LinkChecker,
    editor: ASTMarkdownEditor,
    console: Console,
) -> None:
    """Interactively confirm and apply redirect rewrites for markdown links.

    For each redirected markdown link the user is prompted once per unique
    (file, old URL, final URL) triple; on confirmation the final URL is
    re-verified (must still answer 200) before the rewrite is queued. All
    confirmed replacements for a file are applied in a single AST edit.
    """
    pending: dict[Path, dict[str, str]] = defaultdict(dict)
    asked: set[tuple[Path, str, str]] = set()
    for record, result in redirects:
        final_url = result.final_url
        # Naked URLs are report-only; only [text](url) links are rewritten.
        if record.kind != "markdown" or not final_url:
            continue
        key = (record.file_path, record.url, final_url)
        if key in asked:
            continue
        asked.add(key)
        console.print(f"\n[cyan]{record.file_path}:{record.line}[/cyan]")
        console.print(f"[yellow]{record.url}[/yellow] -> [green]{final_url}[/green]")
        if console.input("Replace old URL with final URL? [y/N] ").strip().lower() != "y":
            continue
        verification = checker.check(final_url)
        if verification.status_code != 200:
            console.print(
                f"[red]Skip:[/red] final URL no longer valid ({verification.status_code or verification.error})"
            )
            continue
        pending[record.file_path][record.url] = final_url
    for file_path, replacements in pending.items():
        original = file_path.read_text(encoding="utf-8")
        rewritten = editor.replace_links(original, replacements)
        if rewritten != original:
            file_path.write_text(rewritten, encoding="utf-8")
            console.print(f"[green]Updated[/green] {file_path}")
def main() -> None:
    """CLI entry point: scan Markdown, check links, report, offer rewrites."""
    args = parse_args()
    console = Console()
    records = MarkdownScanner().scan_path(args.path)
    if not records:
        console.print("No links found.")
        return
    with LinkChecker(timeout=args.timeout) as checker:
        checks = checker.check_many(unique_preserve_order(record.url for record in records))
        table = _build_report_table(records, checks)
        if table.row_count:
            console.print(table)
        else:
            console.print("No non-200 links found.")
        redirects = _collect_redirects(records, checks)
        if redirects:
            _handle_rewrites(
                redirects=redirects,
                checker=checker,
                editor=ASTMarkdownEditor(),
                console=console,
            )


if __name__ == "__main__":
    main()

28
mdlink/models.py Normal file
View File

@@ -0,0 +1,28 @@
from __future__ import annotations
from dataclasses import dataclass
from pathlib import Path
from typing import Optional
@dataclass(frozen=True)
class LinkRecord:
    """A single link occurrence found in a Markdown file."""

    file_path: Path  # file the link was found in
    line: int  # 1-based line of the enclosing inline block (paragraph start)
    url: str  # link target exactly as written in the source
    kind: str  # "markdown" for [text](url) links, "naked" for bare URLs
@dataclass(frozen=True)
class LinkCheckResult:
    """Outcome of checking a single URL over HTTP."""

    original_url: str  # URL as found in the Markdown source
    status_code: Optional[int]  # final HTTP status; None on transport error
    final_url: Optional[str]  # URL after following redirects; None on error
    redirected: bool  # True when final_url differs from original_url
    error: Optional[str] = None  # transport error message, if any

    @property
    def should_report(self) -> bool:
        """True when the check errored or finished with a non-200 status."""
        return self.error is not None or self.status_code != 200

63
mdlink/scanner.py Normal file
View File

@@ -0,0 +1,63 @@
from __future__ import annotations
import re
from pathlib import Path
from typing import Iterable
from markdown_it import MarkdownIt
from markdown_it.token import Token
from .models import LinkRecord
from .utils import is_http_url, iter_markdown_files
NAKED_URL_RE = re.compile(r"https?://[^\s<>()\[\]{}\"']+")
class MarkdownScanner:
    """Extracts HTTP/HTTPS links from Markdown files via markdown-it tokens."""

    def __init__(self) -> None:
        self._md = MarkdownIt("commonmark")

    def scan_path(self, target: Path) -> list[LinkRecord]:
        """Scan *target* (a file or a directory tree) and return all links found."""
        found: list[LinkRecord] = []
        for markdown_file in iter_markdown_files(target):
            text = markdown_file.read_text(encoding="utf-8")
            found.extend(self.scan_content(file_path=markdown_file, content=text))
        return found

    def scan_content(self, file_path: Path, content: str) -> list[LinkRecord]:
        """Parse *content* and collect link records from every inline token."""
        records: list[LinkRecord] = []
        for token in self._md.parse(content):
            if token.type == "inline" and token.children:
                records.extend(
                    self._extract_from_inline(
                        file_path=file_path,
                        line=self._line_from_token(token),
                        children=token.children,
                    )
                )
        return records

    def _extract_from_inline(self, file_path: Path, line: int, children: Iterable[Token]) -> list[LinkRecord]:
        """Walk inline children: record markdown links, and naked URLs only
        when outside any link. Image tokens fall through untouched."""
        collected: list[LinkRecord] = []
        link_depth = 0
        for child in children:
            token_type = child.type
            if token_type == "link_open":
                link_depth += 1
                href = child.attrGet("href")
                if href and is_http_url(href):
                    collected.append(LinkRecord(file_path=file_path, line=line, url=href, kind="markdown"))
            elif token_type == "link_close":
                link_depth = max(0, link_depth - 1)
            elif token_type == "text" and link_depth == 0:
                for match in NAKED_URL_RE.finditer(child.content):
                    # Strip trailing sentence punctuation from bare URLs.
                    candidate = match.group(0).rstrip(".,;:!?")
                    if is_http_url(candidate):
                        collected.append(LinkRecord(file_path=file_path, line=line, url=candidate, kind="naked"))
        return collected

    @staticmethod
    def _line_from_token(token: Token) -> int:
        """Best-effort 1-based line: the inline token's first mapped source line."""
        if token.map and token.map[0] is not None:
            return int(token.map[0]) + 1
        return 1

34
mdlink/utils.py Normal file
View File

@@ -0,0 +1,34 @@
from __future__ import annotations
from pathlib import Path
from typing import Iterable, Iterator
from urllib.parse import urlparse
# File extensions treated as Markdown.
MARKDOWN_EXTENSIONS = {".md"}


def iter_markdown_files(target: Path) -> Iterator[Path]:
    """Yield Markdown files under *target*.

    A single file is yielded only when it carries a Markdown extension
    (case-insensitive); a directory is walked recursively, yielding
    regular ``*.md`` files in sorted order.
    """
    if target.is_file():
        if target.suffix.lower() in MARKDOWN_EXTENSIONS:
            yield target
        return
    yield from (candidate for candidate in sorted(target.rglob("*.md")) if candidate.is_file())
def is_http_url(url: str) -> bool:
    """Return True for absolute ``http://`` or ``https://`` URLs with a host."""
    parts = urlparse(url)
    if parts.scheme not in ("http", "https"):
        return False
    return bool(parts.netloc)
def unique_preserve_order(values: Iterable[str]) -> list[str]:
    """Return *values* with duplicates removed, keeping first-seen order.

    Uses dict key uniqueness (dicts are insertion-ordered since Python 3.7)
    instead of a hand-rolled seen-set loop; same O(n) behavior, less code.
    """
    return list(dict.fromkeys(values))

21
pyproject.toml Normal file
View File

@@ -0,0 +1,21 @@
[project]
name = "mdlink"
version = "0.1.0"
description = "CLI tool to validate Markdown links and optionally rewrite redirects."
readme = "README.md"
authors = [
{name = "drg", email = "gammlaa@chaospott.de"}
]
license = "MIT"
requires-python = ">=3.9"
dependencies = [
"httpx>=0.27.0",
"markdown-it-py>=3.0.0",
"rich>=13.7.0"
]
[project.scripts]
mdlink = "mdlink.cli:main"
[tool.setuptools.packages.find]
include = ["mdlink*"]

22
test.md Normal file
View File

@@ -0,0 +1,22 @@
# mdlink Test
- [ok](https://httpbin.org/status/200)
- [redirect](https://github.com/)
- [broken](https://httpbin.org/status/404)
- Naked: https://httpbin.org/status/500
- ![img](https://httpbin.org/image/png)
## Additional Cases
- [redirect with title](https://github.com/ "GitHub redirect")
- [query and fragment](https://example.com/docs?lang=de#intro)
- [duplicate redirect](https://github.com/)
- [duplicate redirect again](https://github.com/)
- [non-http scheme should be ignored](mailto:team@example.org)
- [ftp should be ignored](ftp://speedtest.tele2.net)
- [inline code URL should not be a markdown link](`https://example.org/code`)
- [image in text should be ignored] text before ![logo](www.uph.de) text after
- [parentheses in URL](https://en.wikipedia.org/wiki/Function_(mathematics))
- [trailing punctuation in sentence] See https://example.org/docs, for details.
- autolink angle brackets: <https://example.org/autolink>
- bare www should be ignored: www.example.org

31
test.sh Executable file
View File

@@ -0,0 +1,31 @@
#!/usr/bin/env bash
# Generate a Markdown fixture exercising mdlink's link-detection cases:
# ok/redirect/broken links, naked URLs, images, non-http schemes,
# inline-code URLs, parenthesized URLs, and autolinks.
set -euo pipefail
# Destination path; defaults to test.md when no argument is supplied.
OUTPUT_FILE="${1:-test.md}"
# Quoted heredoc delimiter ('EOF') writes the content verbatim — no
# variable expansion, so URLs and backticks survive untouched.
cat > "${OUTPUT_FILE}" <<'EOF'
# mdlink Test
- [ok](https://httpbin.org/status/200)
- [redirect](http://github.com)
- [broken](https://httpbin.org/status/404)
- Naked: https://httpbin.org/status/500
- ![img](https://httpbin.org/image/png)
## Additional Cases
- [redirect with title](http://github.com "GitHub redirect")
- [query and fragment](https://example.com/docs?lang=de#intro)
- [duplicate redirect](http://github.com)
- [duplicate redirect again](http://github.com)
- [non-http scheme should be ignored](mailto:team@example.org)
- [ftp should be ignored](ftp://speedtest.tele2.net)
- [inline code URL should not be a markdown link](`https://example.org/code`)
- [image in text should be ignored] text before ![logo](https://example.com/logo.png) text after
- [parentheses in URL](https://en.wikipedia.org/wiki/Function_(mathematics))
- [trailing punctuation in sentence] See https://example.org/docs, for details.
- autolink angle brackets: <https://example.org/autolink>
- bare www should be ignored: www.example.org
EOF
echo "Generated ${OUTPUT_FILE}"