init mdlink
This commit is contained in:
43
.gitignore
vendored
Normal file
43
.gitignore
vendored
Normal file
@@ -0,0 +1,43 @@
|
|||||||
|
# Python bytecode and caches
|
||||||
|
__pycache__/
|
||||||
|
*.py[cod]
|
||||||
|
*$py.class
|
||||||
|
|
||||||
|
# Virtual environments
|
||||||
|
.venv/
|
||||||
|
venv/
|
||||||
|
env/
|
||||||
|
ENV/
|
||||||
|
|
||||||
|
# Build and packaging artifacts
|
||||||
|
build/
|
||||||
|
dist/
|
||||||
|
*.egg-info/
|
||||||
|
.eggs/
|
||||||
|
pip-wheel-metadata/
|
||||||
|
*.whl
|
||||||
|
|
||||||
|
# Test and coverage outputs
|
||||||
|
.pytest_cache/
|
||||||
|
.mypy_cache/
|
||||||
|
.ruff_cache/
|
||||||
|
.coverage
|
||||||
|
.coverage.*
|
||||||
|
htmlcov/
|
||||||
|
.tox/
|
||||||
|
.nox/
|
||||||
|
|
||||||
|
# Local tooling/runtime files
|
||||||
|
.DS_Store
|
||||||
|
Thumbs.db
|
||||||
|
*.log
|
||||||
|
.codex
|
||||||
|
|
||||||
|
# IDE/editor settings
|
||||||
|
.idea/
|
||||||
|
.vscode/
|
||||||
|
*.swp
|
||||||
|
*.swo
|
||||||
|
|
||||||
|
# Local test artifacts
|
||||||
|
test_preview.md
|
||||||
52
README.md
Normal file
52
README.md
Normal file
@@ -0,0 +1,52 @@
|
|||||||
|
# mdlink
|
||||||
|
|
||||||
|
`mdlink` is a CLI tool to recursively scan Markdown files and validate HTTP/HTTPS links.
|
||||||
|
|
||||||
|
## Features
|
||||||
|
|
||||||
|
- Scans `.md` files recursively in a directory (or a single `.md` file).
|
||||||
|
- Extracts Markdown links (`[text](url)`) and naked URLs (`https://...`).
|
||||||
|
- Ignores image links (`![alt](url)`).
|
||||||
|
- Checks links with `httpx` and follows redirects automatically.
|
||||||
|
- Ignores `200 OK` in the report; shows non-200 and request errors.
|
||||||
|
- Shows final resolved URL in output.
|
||||||
|
- Asks interactively before rewriting redirected Markdown links.
|
||||||
|
- Re-checks target URL before writing (must still return `200`).
|
||||||
|
- Rewrites links via Markdown AST editing (`markdown-it-py`), not string replacement.
|
||||||
|
|
||||||
|
## Installation
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python3 -m venv .venv
|
||||||
|
source .venv/bin/activate
|
||||||
|
pip install .
|
||||||
|
```
|
||||||
|
|
||||||
|
## Usage
|
||||||
|
|
||||||
|
```bash
|
||||||
|
mdlink /path/to/docs
|
||||||
|
```
|
||||||
|
|
||||||
|
## Options
|
||||||
|
|
||||||
|
- `--timeout FLOAT`
|
||||||
|
Per-request timeout in seconds (default: `10.0`).
|
||||||
|
|
||||||
|
## Interactive Redirect Rewrite
|
||||||
|
|
||||||
|
When a Markdown link redirects, `mdlink` prompts:
|
||||||
|
|
||||||
|
```text
|
||||||
|
Replace old URL with final URL? [y/N]
|
||||||
|
```
|
||||||
|
|
||||||
|
- [broken](https://httpbin.org/status/404)
|
||||||
|
|
||||||
|
Only confirmed links are updated.
|
||||||
|
|
||||||
|
## Help
|
||||||
|
|
||||||
|
```bash
|
||||||
|
mdlink --help
|
||||||
|
```
|
||||||
5
mdlink/__init__.py
Normal file
5
mdlink/__init__.py
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
"""mdlink package."""
|
||||||
|
|
||||||
|
from .cli import main
|
||||||
|
|
||||||
|
__all__ = ["main"]
|
||||||
217
mdlink/ast_editor.py
Normal file
217
mdlink/ast_editor.py
Normal file
@@ -0,0 +1,217 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from collections import defaultdict
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from typing import DefaultDict
|
||||||
|
|
||||||
|
from markdown_it import MarkdownIt
|
||||||
|
from markdown_it.token import Token
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class _LinePatch:
    """One URL substitution to apply within a single source line."""

    # URL text exactly as it currently appears in the line.
    old: str
    # Replacement URL to splice in.
    new: str
|
||||||
|
|
||||||
|
|
||||||
|
class ASTMarkdownEditor:
    """Rewrites inline Markdown link destinations in place.

    The markdown-it AST is used only to locate which source lines contain
    links whose href should change; the actual edit is a character-level
    splice on the original line, so all surrounding formatting is kept
    byte-for-byte.
    """

    def __init__(self) -> None:
        # CommonMark preset — same dialect used by the scanner in this package.
        self._md = MarkdownIt("commonmark")

    def replace_links(self, content: str, replacements: dict[str, str]) -> str:
        """Return *content* with matching link URLs rewritten.

        *replacements* maps old href -> new href. Lines with no matching
        link are returned untouched; newline style is preserved because
        lines are split with ``keepends=True``.
        """
        if not replacements:
            return content

        tokens = self._md.parse(content)
        patches_by_line = self._collect_line_patches(tokens=tokens, replacements=replacements)
        if not patches_by_line:
            return content

        lines = content.splitlines(keepends=True)
        for line_index, patches in patches_by_line.items():
            # Guard against token maps that point outside the real line list.
            if line_index < 0 or line_index >= len(lines):
                continue
            lines[line_index] = self._rewrite_markdown_links_in_line(lines[line_index], patches)
        return "".join(lines)

    def _collect_line_patches(
        self,
        tokens: list[Token],
        replacements: dict[str, str],
    ) -> dict[int, list[_LinePatch]]:
        """Group requested URL substitutions by 0-based source line index.

        Walks inline tokens, collecting a patch for every ``link_open``
        whose href appears in *replacements* with a genuinely different
        target.
        """
        patches_by_line: DefaultDict[int, list[_LinePatch]] = defaultdict(list)
        for token in tokens:
            if token.type != "inline" or not token.children:
                continue
            if not token.map:
                continue
            # NOTE(review): token.map[0] is the first line of the enclosing
            # block; links on later lines of a multi-line paragraph would be
            # attributed to that first line — confirm against inputs used.
            line_index = int(token.map[0])
            for child in token.children:
                if child.type != "link_open":
                    continue
                href = child.attrGet("href")
                if not href:
                    continue
                new_href = replacements.get(href)
                if not new_href or new_href == href:
                    continue
                patches_by_line[line_index].append(_LinePatch(old=href, new=new_href))
        return dict(patches_by_line)

    def _rewrite_markdown_links_in_line(self, line: str, patches: list[_LinePatch]) -> str:
        """Apply *patches* to one line via character-level splicing.

        Skips inline-code spans (backtick-delimited) and image links
        (``![...]``). Patches are consumed strictly in order: each parsed
        link is compared only against the next pending patch, so patch
        order must match link order on the line (which it does when the
        patches come from _collect_line_patches for a single-line block).
        """
        if not patches:
            return line

        patch_index = 0
        chars = list(line)
        i = 0
        in_code = False
        code_ticks = 0  # backtick run length that opened the current code span

        while i < len(chars) and patch_index < len(patches):
            char = chars[i]

            # Track inline code: a code span closes only on a backtick run
            # of the same length that opened it.
            if char == "`" and not self._is_escaped(chars, i):
                run = self._count_char_run(chars, i, "`")
                if not in_code:
                    in_code = True
                    code_ticks = run
                elif run == code_ticks:
                    in_code = False
                    code_ticks = 0
                i += run
                continue

            if in_code:
                i += 1
                continue

            if char == "[" and not self._is_escaped(chars, i):
                # Leading '!' marks an image; images are never rewritten.
                if i > 0 and chars[i - 1] == "!":
                    i += 1
                    continue
                parsed = self._parse_inline_link(chars, i)
                if parsed is None:
                    i += 1
                    continue
                start_url, end_url, parsed_url, close_index = parsed
                patch = patches[patch_index]
                if parsed_url == patch.old:
                    replacement = list(patch.new)
                    chars[start_url:end_url] = replacement
                    # The splice may grow/shrink the line; shift the closing
                    # paren index by the length delta before jumping past it.
                    delta = len(replacement) - (end_url - start_url)
                    close_index += delta
                    patch_index += 1
                    i = close_index + 1
                    continue

            i += 1

        return "".join(chars)

    def _parse_inline_link(self, chars: list[str], open_bracket: int) -> tuple[int, int, str, int] | None:
        """Parse an inline link ``[text](dest)`` starting at *open_bracket*.

        Returns ``(url_start, url_end, url, close_paren_index)`` with
        indices into *chars*, or ``None`` when the text at *open_bracket*
        is not a well-formed inline link. Angle-bracketed destinations
        (``(<...>)``) have the brackets excluded from the URL span.
        """
        close_bracket = self._find_link_text_end(chars, open_bracket)
        if close_bracket is None:
            return None

        # Allow optional whitespace between ']' and '('.
        cursor = close_bracket + 1
        while cursor < len(chars) and chars[cursor] in (" ", "\t"):
            cursor += 1
        if cursor >= len(chars) or chars[cursor] != "(":
            return None

        close_paren = self._find_matching_paren(chars, cursor)
        if close_paren is None:
            return None

        # Skip leading whitespace inside the parentheses.
        dest_start = cursor + 1
        while dest_start < close_paren and chars[dest_start] in (" ", "\t"):
            dest_start += 1
        if dest_start >= close_paren:
            return None

        if chars[dest_start] == "<":
            # Angle-bracketed destination: URL runs up to the matching '>'.
            dest_end = dest_start + 1
            while dest_end < close_paren and chars[dest_end] != ">":
                dest_end += 1
            if dest_end >= close_paren:
                return None
            url_start = dest_start + 1
            url_end = dest_end
        else:
            # Bare destination: URL ends at unbalanced ')' or whitespace
            # (whitespace would begin an optional link title).
            url_start = dest_start
            url_end = self._scan_destination_end(chars, start=dest_start, stop=close_paren)
            if url_end <= url_start:
                return None

        parsed_url = "".join(chars[url_start:url_end])
        return url_start, url_end, parsed_url, close_paren

    def _find_link_text_end(self, chars: list[str], open_bracket: int) -> int | None:
        """Return the index of the ']' matching *open_bracket*, honoring
        nested unescaped brackets; None when unbalanced."""
        depth = 1
        index = open_bracket + 1
        while index < len(chars):
            char = chars[index]
            if char == "[" and not self._is_escaped(chars, index):
                depth += 1
            elif char == "]" and not self._is_escaped(chars, index):
                depth -= 1
                if depth == 0:
                    return index
            index += 1
        return None

    def _find_matching_paren(self, chars: list[str], open_paren: int) -> int | None:
        """Return the index of the ')' matching *open_paren*, or None.

        Parentheses inside single- or double-quoted spans (link titles)
        are ignored, as are escaped parentheses.
        """
        depth = 1
        index = open_paren + 1
        in_quote: str | None = None
        while index < len(chars):
            char = chars[index]
            if in_quote is not None:
                # Inside a quoted title: only an unescaped closing quote matters.
                if char == in_quote and not self._is_escaped(chars, index):
                    in_quote = None
                index += 1
                continue
            if char in ('"', "'") and not self._is_escaped(chars, index):
                in_quote = char
                index += 1
                continue
            if char == "(" and not self._is_escaped(chars, index):
                depth += 1
            elif char == ")" and not self._is_escaped(chars, index):
                depth -= 1
                if depth == 0:
                    return index
            index += 1
        return None

    def _scan_destination_end(self, chars: list[str], start: int, stop: int) -> int:
        """Return the end index (exclusive) of a bare link destination.

        Scans forward from *start*, stopping at top-level whitespace or an
        unbalanced ')'. Balanced parentheses inside the URL (e.g.
        Wikipedia-style paths) are kept as part of the destination.
        """
        depth = 0
        index = start
        while index < stop:
            char = chars[index]
            if char in (" ", "\t"):
                if depth == 0:
                    break
            elif char == "(" and not self._is_escaped(chars, index):
                depth += 1
            elif char == ")" and not self._is_escaped(chars, index):
                if depth == 0:
                    break
                depth -= 1
            index += 1
        return index

    def _is_escaped(self, chars: list[str], pos: int) -> bool:
        """True when the character at *pos* is preceded by an odd number of
        backslashes (i.e. it is backslash-escaped)."""
        backslashes = 0
        index = pos - 1
        while index >= 0 and chars[index] == "\\":
            backslashes += 1
            index -= 1
        return (backslashes % 2) == 1

    def _count_char_run(self, chars: list[str], start: int, char: str) -> int:
        """Length of the run of *char* beginning at *start*."""
        end = start
        while end < len(chars) and chars[end] == char:
            end += 1
        return end - start
|
||||||
54
mdlink/checker.py
Normal file
54
mdlink/checker.py
Normal file
@@ -0,0 +1,54 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from typing import Iterable
|
||||||
|
|
||||||
|
import httpx
|
||||||
|
|
||||||
|
from .models import LinkCheckResult
|
||||||
|
from .utils import unique_preserve_order
|
||||||
|
|
||||||
|
|
||||||
|
class LinkChecker:
    """Validate URLs over HTTP with a shared httpx client.

    Redirects are followed automatically and the final resolved URL is
    reported. Usable as a context manager so the connection pool is
    released deterministically.
    """

    def __init__(self, timeout: float = 10.0) -> None:
        self._timeout = timeout
        self._client = httpx.Client(
            follow_redirects=True,
            timeout=httpx.Timeout(timeout),
            headers={"User-Agent": "mdlink/0.1"},
        )

    def close(self) -> None:
        """Release the underlying HTTP connection pool."""
        self._client.close()

    def __enter__(self) -> "LinkChecker":
        return self

    def __exit__(self, exc_type, exc, tb) -> None:
        self.close()

    def check(self, url: str) -> LinkCheckResult:
        """GET *url* and describe the outcome.

        Transport-level failures (DNS, timeout, TLS, ...) are captured as
        an error string rather than raised.
        """
        try:
            response = self._client.get(url)
        except httpx.HTTPError as exc:
            return LinkCheckResult(
                original_url=url,
                status_code=None,
                final_url=None,
                redirected=False,
                error=str(exc),
            )
        resolved = str(response.url)
        return LinkCheckResult(
            original_url=url,
            status_code=response.status_code,
            final_url=resolved,
            redirected=resolved != url,
            error=None,
        )

    def check_many(self, urls: Iterable[str]) -> dict[str, LinkCheckResult]:
        """Check each distinct URL exactly once, preserving first-seen order."""
        return {url: self.check(url) for url in unique_preserve_order(urls)}
|
||||||
127
mdlink/cli.py
Normal file
127
mdlink/cli.py
Normal file
@@ -0,0 +1,127 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
from collections import defaultdict
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from rich.console import Console
|
||||||
|
from rich.table import Table
|
||||||
|
|
||||||
|
from .ast_editor import ASTMarkdownEditor
|
||||||
|
from .checker import LinkChecker
|
||||||
|
from .models import LinkCheckResult, LinkRecord
|
||||||
|
from .scanner import MarkdownScanner
|
||||||
|
from .utils import unique_preserve_order
|
||||||
|
|
||||||
|
|
||||||
|
def parse_args() -> argparse.Namespace:
    """Parse the mdlink command line: a target path plus request options."""
    arg_parser = argparse.ArgumentParser(
        prog="mdlink",
        description="Scan Markdown files and validate links.",
    )
    arg_parser.add_argument(
        "path",
        type=Path,
        help="Directory or Markdown file to scan",
    )
    arg_parser.add_argument(
        "--timeout",
        type=float,
        default=10.0,
        help="Request timeout in seconds",
    )
    return arg_parser.parse_args()
|
||||||
|
|
||||||
|
|
||||||
|
def _build_report_table(records: list[LinkRecord], checks: dict[str, LinkCheckResult]) -> Table:
    """Build a Rich table listing every link whose check result is reportable.

    Links that resolved with a plain 200 are omitted; request errors show
    the error text in the status column.
    """
    table = Table(title="Non-200 Links")
    table.add_column("file")
    table.add_column("line", justify="right")
    table.add_column("original URL")
    table.add_column("status")
    table.add_column("final URL")

    for record in records:
        outcome = checks[record.url]
        if not outcome.should_report:
            continue
        if outcome.status_code is None:
            status_cell = f"ERR: {outcome.error}"
        else:
            status_cell = str(outcome.status_code)
        table.add_row(
            str(record.file_path),
            str(record.line),
            record.url,
            status_cell,
            outcome.final_url or "",
        )
    return table
|
||||||
|
|
||||||
|
|
||||||
|
def _collect_redirects(records: list[LinkRecord], checks: dict[str, LinkCheckResult]) -> list[tuple[LinkRecord, LinkCheckResult]]:
|
||||||
|
redirects: list[tuple[LinkRecord, LinkCheckResult]] = []
|
||||||
|
for record in records:
|
||||||
|
result = checks[record.url]
|
||||||
|
if not result.redirected:
|
||||||
|
continue
|
||||||
|
if not result.final_url:
|
||||||
|
continue
|
||||||
|
redirects.append((record, result))
|
||||||
|
return redirects
|
||||||
|
|
||||||
|
|
||||||
|
def _handle_rewrites(
    redirects: list[tuple[LinkRecord, LinkCheckResult]],
    checker: LinkChecker,
    editor: ASTMarkdownEditor,
    console: Console,
) -> None:
    """Interactively offer to rewrite redirected Markdown links on disk.

    For each unique (file, old URL, final URL) triple the user is prompted;
    on confirmation the final URL is re-checked (it must still return 200)
    before being queued. All confirmed replacements for a file are applied
    in one read/rewrite/write pass at the end.
    """
    # Replacements queued per file; applied in a single pass per file below.
    replacements_by_file: dict[Path, dict[str, str]] = defaultdict(dict)
    # Dedupe: the same URL may appear multiple times in one file.
    seen_pairs: set[tuple[Path, str, str]] = set()

    for record, result in redirects:
        # Only Markdown-syntax links are rewritten; naked URLs are left alone.
        if record.kind != "markdown":
            continue
        final_url = result.final_url
        if not final_url:
            continue
        pair = (record.file_path, record.url, final_url)
        if pair in seen_pairs:
            continue
        seen_pairs.add(pair)

        console.print(f"\n[cyan]{record.file_path}:{record.line}[/cyan]")
        console.print(f"[yellow]{record.url}[/yellow] -> [green]{final_url}[/green]")
        # Default answer is "no": anything other than exactly 'y' skips.
        answer = console.input("Replace old URL with final URL? [y/N] ").strip().lower()
        if answer != "y":
            continue

        # Re-verify the redirect target right before committing to the edit.
        verification = checker.check(final_url)
        if verification.status_code != 200:
            console.print(
                f"[red]Skip:[/red] final URL no longer valid ({verification.status_code or verification.error})"
            )
            continue
        replacements_by_file[record.file_path][record.url] = final_url

    # Apply all confirmed replacements, writing only files that changed.
    for file_path, replacements in replacements_by_file.items():
        content = file_path.read_text(encoding="utf-8")
        updated = editor.replace_links(content, replacements)
        if updated != content:
            file_path.write_text(updated, encoding="utf-8")
            console.print(f"[green]Updated[/green] {file_path}")
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
    """CLI entry point: scan, check, report, then offer redirect rewrites."""
    args = parse_args()
    console = Console()

    records = MarkdownScanner().scan_path(args.path)
    if not records:
        console.print("No links found.")
        return

    distinct_urls = unique_preserve_order(item.url for item in records)
    with LinkChecker(timeout=args.timeout) as checker:
        checks = checker.check_many(distinct_urls)

        report = _build_report_table(records, checks)
        if report.row_count:
            console.print(report)
        else:
            console.print("No non-200 links found.")

        redirects = _collect_redirects(records, checks)
        if redirects:
            _handle_rewrites(
                redirects=redirects,
                checker=checker,
                editor=ASTMarkdownEditor(),
                console=console,
            )
|
||||||
|
|
||||||
|
|
||||||
|
# Allow direct execution (`python mdlink/cli.py`) in addition to the
# `mdlink` console script declared in pyproject.toml.
if __name__ == "__main__":
    main()
|
||||||
28
mdlink/models.py
Normal file
28
mdlink/models.py
Normal file
@@ -0,0 +1,28 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class LinkRecord:
    """A single link occurrence discovered in a Markdown file."""

    # File the link was found in.
    file_path: Path
    # 1-based source line (best effort; see scanner._line_from_token).
    line: int
    # The link target as written in the document.
    url: str
    # "markdown" for [text](url) links, "naked" for bare URLs in text.
    kind: str
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class LinkCheckResult:
    """Outcome of checking one URL.

    Either the HTTP fields are populated (status_code/final_url) or the
    request failed and *error* carries the transport error text.
    """

    original_url: str
    status_code: Optional[int]
    final_url: Optional[str]
    redirected: bool
    error: Optional[str] = None

    @property
    def should_report(self) -> bool:
        """True when the result belongs in the report: any request error,
        or any status other than a plain 200."""
        return self.error is not None or self.status_code != 200
|
||||||
63
mdlink/scanner.py
Normal file
63
mdlink/scanner.py
Normal file
@@ -0,0 +1,63 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import re
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Iterable
|
||||||
|
|
||||||
|
from markdown_it import MarkdownIt
|
||||||
|
from markdown_it.token import Token
|
||||||
|
|
||||||
|
from .models import LinkRecord
|
||||||
|
from .utils import is_http_url, iter_markdown_files
|
||||||
|
|
||||||
|
NAKED_URL_RE = re.compile(r"https?://[^\s<>()\[\]{}\"']+")
|
||||||
|
|
||||||
|
|
||||||
|
class MarkdownScanner:
    """Extract HTTP(S) link records from Markdown via markdown-it tokens."""

    def __init__(self) -> None:
        self._md = MarkdownIt("commonmark")

    def scan_path(self, target: Path) -> list[LinkRecord]:
        """Scan a file or directory tree and return every discovered link."""
        collected: list[LinkRecord] = []
        for md_file in iter_markdown_files(target):
            text = md_file.read_text(encoding="utf-8")
            collected.extend(self.scan_content(file_path=md_file, content=text))
        return collected

    def scan_content(self, file_path: Path, content: str) -> list[LinkRecord]:
        """Parse *content* and return link records from its inline tokens."""
        found: list[LinkRecord] = []
        for token in self._md.parse(content):
            if token.type != "inline" or not token.children:
                continue
            found.extend(
                self._extract_from_inline(
                    file_path=file_path,
                    line=self._line_from_token(token),
                    children=token.children,
                )
            )
        return found

    def _extract_from_inline(self, file_path: Path, line: int, children: Iterable[Token]) -> list[LinkRecord]:
        """Collect markdown-link hrefs and naked URLs from inline children.

        Naked-URL scanning is suppressed inside link text (link_depth > 0)
        so a URL never gets recorded twice; image tokens match no branch
        and are thereby ignored.
        """
        records: list[LinkRecord] = []
        link_depth = 0
        for child in children:
            token_kind = child.type
            if token_kind == "link_open":
                link_depth += 1
                href = child.attrGet("href")
                if href and is_http_url(href):
                    records.append(LinkRecord(file_path=file_path, line=line, url=href, kind="markdown"))
            elif token_kind == "link_close":
                link_depth = max(0, link_depth - 1)
            elif token_kind == "text" and link_depth == 0:
                for match in NAKED_URL_RE.finditer(child.content):
                    # Strip sentence punctuation that trails a bare URL.
                    candidate = match.group(0).rstrip(".,;:!?")
                    if is_http_url(candidate):
                        records.append(LinkRecord(file_path=file_path, line=line, url=candidate, kind="naked"))
        return records

    @staticmethod
    def _line_from_token(token: Token) -> int:
        """Best-effort 1-based source line for *token*; 1 when unmapped."""
        if token.map and token.map[0] is not None:
            return int(token.map[0]) + 1
        return 1
|
||||||
34
mdlink/utils.py
Normal file
34
mdlink/utils.py
Normal file
@@ -0,0 +1,34 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Iterable, Iterator
|
||||||
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
|
|
||||||
|
MARKDOWN_EXTENSIONS = {".md"}
|
||||||
|
|
||||||
|
|
||||||
|
def iter_markdown_files(target: Path) -> Iterator[Path]:
    """Yield Markdown files under *target* in sorted order.

    A file argument is yielded directly when its (case-insensitive) suffix
    is a Markdown extension; a directory is searched recursively for
    ``*.md`` entries, skipping anything that is not a regular file.
    """
    if target.is_file():
        if target.suffix.lower() in MARKDOWN_EXTENSIONS:
            yield target
        return
    candidates = sorted(target.rglob("*.md"))
    yield from (candidate for candidate in candidates if candidate.is_file())
|
||||||
|
|
||||||
|
|
||||||
|
def is_http_url(url: str) -> bool:
    """Return True when *url* is an absolute http/https URL with a host."""
    parts = urlparse(url)
    if parts.scheme not in ("http", "https"):
        return False
    return bool(parts.netloc)
|
||||||
|
|
||||||
|
|
||||||
|
def unique_preserve_order(values: Iterable[str]) -> list[str]:
    """Return the distinct values in first-seen order.

    Relies on dict preserving insertion order (guaranteed since Python 3.7).
    """
    return list(dict.fromkeys(values))
|
||||||
21
pyproject.toml
Normal file
21
pyproject.toml
Normal file
@@ -0,0 +1,21 @@
|
|||||||
|
[project]
|
||||||
|
name = "mdlink"
|
||||||
|
version = "0.1.0"
|
||||||
|
description = "CLI tool to validate Markdown links and optionally rewrite redirects."
|
||||||
|
readme = "README.md"
|
||||||
|
authors = [
|
||||||
|
{name = "drg", email = "gammlaa@chaospott.de"}
|
||||||
|
]
|
||||||
|
license = "MIT"
|
||||||
|
requires-python = ">=3.9"
|
||||||
|
dependencies = [
|
||||||
|
"httpx>=0.27.0",
|
||||||
|
"markdown-it-py>=3.0.0",
|
||||||
|
"rich>=13.7.0"
|
||||||
|
]
|
||||||
|
|
||||||
|
[project.scripts]
|
||||||
|
mdlink = "mdlink.cli:main"
|
||||||
|
|
||||||
|
[tool.setuptools.packages.find]
|
||||||
|
include = ["mdlink*"]
|
||||||
22
test.md
Normal file
22
test.md
Normal file
@@ -0,0 +1,22 @@
|
|||||||
|
# mdlink Test
|
||||||
|
|
||||||
|
- [ok](https://httpbin.org/status/200)
|
||||||
|
- [redirect](https://github.com/)
|
||||||
|
- [broken](https://httpbin.org/status/404)
|
||||||
|
- Naked: https://httpbin.org/status/500
|
||||||
|
- 
|
||||||
|
|
||||||
|
## Additional Cases
|
||||||
|
|
||||||
|
- [redirect with title](https://github.com/ "GitHub redirect")
|
||||||
|
- [query and fragment](https://example.com/docs?lang=de#intro)
|
||||||
|
- [duplicate redirect](https://github.com/)
|
||||||
|
- [duplicate redirect again](https://github.com/)
|
||||||
|
- [non-http scheme should be ignored](mailto:team@example.org)
|
||||||
|
- [ftp should be ignored](ftp://speedtest.tele2.net)
|
||||||
|
- [inline code URL should not be a markdown link](`https://example.org/code`)
|
||||||
|
- [image in text should be ignored] text before  text after
|
||||||
|
- [parentheses in URL](https://en.wikipedia.org/wiki/Function_(mathematics))
|
||||||
|
- [trailing punctuation in sentence] See https://example.org/docs, for details.
|
||||||
|
- autolink angle brackets: <https://example.org/autolink>
|
||||||
|
- bare www should be ignored: www.example.org
|
||||||
31
test.sh
Executable file
31
test.sh
Executable file
@@ -0,0 +1,31 @@
|
|||||||
|
#!/usr/bin/env bash
# Generate a Markdown fixture exercising mdlink's link-detection cases
# (redirects, errors, naked URLs, ignored schemes, tricky syntax).
set -euo pipefail

# Destination path; defaults to test.md when no argument is given.
OUTPUT_FILE="${1:-test.md}"

# Quoted 'EOF' delimiter: the heredoc body is written verbatim, no expansion.
cat > "${OUTPUT_FILE}" <<'EOF'
# mdlink Test

- [ok](https://httpbin.org/status/200)
- [redirect](http://github.com)
- [broken](https://httpbin.org/status/404)
- Naked: https://httpbin.org/status/500
- 

## Additional Cases

- [redirect with title](http://github.com "GitHub redirect")
- [query and fragment](https://example.com/docs?lang=de#intro)
- [duplicate redirect](http://github.com)
- [duplicate redirect again](http://github.com)
- [non-http scheme should be ignored](mailto:team@example.org)
- [ftp should be ignored](ftp://speedtest.tele2.net)
- [inline code URL should not be a markdown link](`https://example.org/code`)
- [image in text should be ignored] text before  text after
- [parentheses in URL](https://en.wikipedia.org/wiki/Function_(mathematics))
- [trailing punctuation in sentence] See https://example.org/docs, for details.
- autolink angle brackets: <https://example.org/autolink>
- bare www should be ignored: www.example.org
EOF

echo "Generated ${OUTPUT_FILE}"
|
||||||
Reference in New Issue
Block a user