From f4fa4fc35ec0b52415ca342bc71a21a1007a5425 Mon Sep 17 00:00:00 2001 From: drg Date: Fri, 10 Apr 2026 15:24:42 +0200 Subject: [PATCH] init mdlink --- .gitignore | 43 +++++++++ README.md | 52 +++++++++++ mdlink/__init__.py | 5 + mdlink/ast_editor.py | 217 +++++++++++++++++++++++++++++++++++++++++++ mdlink/checker.py | 54 +++++++++++ mdlink/cli.py | 127 +++++++++++++++++++++++++ mdlink/models.py | 28 ++++++ mdlink/scanner.py | 63 +++++++++++++ mdlink/utils.py | 34 +++++++ pyproject.toml | 21 +++++ test.md | 22 +++++ test.sh | 31 +++++++ 12 files changed, 697 insertions(+) create mode 100644 .gitignore create mode 100644 README.md create mode 100644 mdlink/__init__.py create mode 100644 mdlink/ast_editor.py create mode 100644 mdlink/checker.py create mode 100644 mdlink/cli.py create mode 100644 mdlink/models.py create mode 100644 mdlink/scanner.py create mode 100644 mdlink/utils.py create mode 100644 pyproject.toml create mode 100644 test.md create mode 100755 test.sh diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..1e06c28 --- /dev/null +++ b/.gitignore @@ -0,0 +1,43 @@ +# Python bytecode and caches +__pycache__/ +*.py[cod] +*$py.class + +# Virtual environments +.venv/ +venv/ +env/ +ENV/ + +# Build and packaging artifacts +build/ +dist/ +*.egg-info/ +.eggs/ +pip-wheel-metadata/ +*.whl + +# Test and coverage outputs +.pytest_cache/ +.mypy_cache/ +.ruff_cache/ +.coverage +.coverage.* +htmlcov/ +.tox/ +.nox/ + +# Local tooling/runtime files +.DS_Store +Thumbs.db +*.log +.codex + +# IDE/editor settings +.idea/ +.vscode/ +*.swp +*.swo + +# Local test artifacts +test_preview.md diff --git a/README.md b/README.md new file mode 100644 index 0000000..3b141ea --- /dev/null +++ b/README.md @@ -0,0 +1,52 @@ +# mdlink + +`mdlink` is a CLI tool to recursively scan Markdown files and validate HTTP/HTTPS links. + +## Features + +- Scans `.md` files recursively in a directory (or a single `.md` file). 
+- Extracts Markdown links (`[text](url)`) and naked URLs (`https://...`). +- Ignores image links (`![alt](url)`). +- Checks links with `httpx` and follows redirects automatically. +- Ignores `200 OK` in the report; shows non-200 and request errors. +- Shows final resolved URL in output. +- Asks interactively before rewriting redirected Markdown links. +- Re-checks target URL before writing (must still return `200`). +- Rewrites links via Markdown AST editing (`markdown-it-py`), not string replacement. + +## Installation + +```bash +python3 -m venv .venv +source .venv/bin/activate +pip install . +``` + +## Usage + +```bash +mdlink /path/to/docs +``` + +## Options + +- `--timeout FLOAT` + Per-request timeout in seconds (default: `10.0`). + +## Interactive Redirect Rewrite + +When a Markdown link redirects, `mdlink` prompts: + +```text +Replace old URL with final URL? [y/N] +``` + +- [broken](https://httpbin.org/status/404) + +Only confirmed links are updated. + +## Help + +```bash +mdlink --help +``` diff --git a/mdlink/__init__.py b/mdlink/__init__.py new file mode 100644 index 0000000..0caa843 --- /dev/null +++ b/mdlink/__init__.py @@ -0,0 +1,5 @@ +"""mdlink package.""" + +from .cli import main + +__all__ = ["main"] diff --git a/mdlink/ast_editor.py b/mdlink/ast_editor.py new file mode 100644 index 0000000..d2fe988 --- /dev/null +++ b/mdlink/ast_editor.py @@ -0,0 +1,217 @@ +from __future__ import annotations + +from collections import defaultdict +from dataclasses import dataclass +from typing import DefaultDict + +from markdown_it import MarkdownIt +from markdown_it.token import Token + + +@dataclass(frozen=True) +class _LinePatch: + old: str + new: str + + +class ASTMarkdownEditor: + def __init__(self) -> None: + self._md = MarkdownIt("commonmark") + + def replace_links(self, content: str, replacements: dict[str, str]) -> str: + if not replacements: + return content + + tokens = self._md.parse(content) + patches_by_line = 
self._collect_line_patches(tokens=tokens, replacements=replacements) + if not patches_by_line: + return content + + lines = content.splitlines(keepends=True) + for line_index, patches in patches_by_line.items(): + if line_index < 0 or line_index >= len(lines): + continue + lines[line_index] = self._rewrite_markdown_links_in_line(lines[line_index], patches) + return "".join(lines) + + def _collect_line_patches( + self, + tokens: list[Token], + replacements: dict[str, str], + ) -> dict[int, list[_LinePatch]]: + patches_by_line: DefaultDict[int, list[_LinePatch]] = defaultdict(list) + for token in tokens: + if token.type != "inline" or not token.children: + continue + if not token.map: + continue + line_index = int(token.map[0]) + for child in token.children: + if child.type != "link_open": + continue + href = child.attrGet("href") + if not href: + continue + new_href = replacements.get(href) + if not new_href or new_href == href: + continue + patches_by_line[line_index].append(_LinePatch(old=href, new=new_href)) + return dict(patches_by_line) + + def _rewrite_markdown_links_in_line(self, line: str, patches: list[_LinePatch]) -> str: + if not patches: + return line + + patch_index = 0 + chars = list(line) + i = 0 + in_code = False + code_ticks = 0 + + while i < len(chars) and patch_index < len(patches): + char = chars[i] + + if char == "`" and not self._is_escaped(chars, i): + run = self._count_char_run(chars, i, "`") + if not in_code: + in_code = True + code_ticks = run + elif run == code_ticks: + in_code = False + code_ticks = 0 + i += run + continue + + if in_code: + i += 1 + continue + + if char == "[" and not self._is_escaped(chars, i): + if i > 0 and chars[i - 1] == "!": + i += 1 + continue + parsed = self._parse_inline_link(chars, i) + if parsed is None: + i += 1 + continue + start_url, end_url, parsed_url, close_index = parsed + patch = patches[patch_index] + if parsed_url == patch.old: + replacement = list(patch.new) + chars[start_url:end_url] = replacement + 
delta = len(replacement) - (end_url - start_url) + close_index += delta + patch_index += 1 + i = close_index + 1 + continue + + i += 1 + + return "".join(chars) + + def _parse_inline_link(self, chars: list[str], open_bracket: int) -> tuple[int, int, str, int] | None: + close_bracket = self._find_link_text_end(chars, open_bracket) + if close_bracket is None: + return None + + cursor = close_bracket + 1 + while cursor < len(chars) and chars[cursor] in (" ", "\t"): + cursor += 1 + if cursor >= len(chars) or chars[cursor] != "(": + return None + + close_paren = self._find_matching_paren(chars, cursor) + if close_paren is None: + return None + + dest_start = cursor + 1 + while dest_start < close_paren and chars[dest_start] in (" ", "\t"): + dest_start += 1 + if dest_start >= close_paren: + return None + + if chars[dest_start] == "<": + dest_end = dest_start + 1 + while dest_end < close_paren and chars[dest_end] != ">": + dest_end += 1 + if dest_end >= close_paren: + return None + url_start = dest_start + 1 + url_end = dest_end + else: + url_start = dest_start + url_end = self._scan_destination_end(chars, start=dest_start, stop=close_paren) + if url_end <= url_start: + return None + + parsed_url = "".join(chars[url_start:url_end]) + return url_start, url_end, parsed_url, close_paren + + def _find_link_text_end(self, chars: list[str], open_bracket: int) -> int | None: + depth = 1 + index = open_bracket + 1 + while index < len(chars): + char = chars[index] + if char == "[" and not self._is_escaped(chars, index): + depth += 1 + elif char == "]" and not self._is_escaped(chars, index): + depth -= 1 + if depth == 0: + return index + index += 1 + return None + + def _find_matching_paren(self, chars: list[str], open_paren: int) -> int | None: + depth = 1 + index = open_paren + 1 + in_quote: str | None = None + while index < len(chars): + char = chars[index] + if in_quote is not None: + if char == in_quote and not self._is_escaped(chars, index): + in_quote = None + index += 1 + 
continue + if char in ('"', "'") and not self._is_escaped(chars, index): + in_quote = char + index += 1 + continue + if char == "(" and not self._is_escaped(chars, index): + depth += 1 + elif char == ")" and not self._is_escaped(chars, index): + depth -= 1 + if depth == 0: + return index + index += 1 + return None + + def _scan_destination_end(self, chars: list[str], start: int, stop: int) -> int: + depth = 0 + index = start + while index < stop: + char = chars[index] + if char in (" ", "\t"): + if depth == 0: + break + elif char == "(" and not self._is_escaped(chars, index): + depth += 1 + elif char == ")" and not self._is_escaped(chars, index): + if depth == 0: + break + depth -= 1 + index += 1 + return index + + def _is_escaped(self, chars: list[str], pos: int) -> bool: + backslashes = 0 + index = pos - 1 + while index >= 0 and chars[index] == "\\": + backslashes += 1 + index -= 1 + return (backslashes % 2) == 1 + + def _count_char_run(self, chars: list[str], start: int, char: str) -> int: + end = start + while end < len(chars) and chars[end] == char: + end += 1 + return end - start diff --git a/mdlink/checker.py b/mdlink/checker.py new file mode 100644 index 0000000..0af97f9 --- /dev/null +++ b/mdlink/checker.py @@ -0,0 +1,54 @@ +from __future__ import annotations + +from typing import Iterable + +import httpx + +from .models import LinkCheckResult +from .utils import unique_preserve_order + + +class LinkChecker: + def __init__(self, timeout: float = 10.0) -> None: + self._timeout = timeout + self._client = httpx.Client( + follow_redirects=True, + timeout=httpx.Timeout(timeout), + headers={"User-Agent": "mdlink/0.1"}, + ) + + def close(self) -> None: + self._client.close() + + def __enter__(self) -> "LinkChecker": + return self + + def __exit__(self, exc_type, exc, tb) -> None: + self.close() + + def check(self, url: str) -> LinkCheckResult: + try: + response = self._client.get(url) + final_url = str(response.url) + redirected = final_url != url + return 
LinkCheckResult( + original_url=url, + status_code=response.status_code, + final_url=final_url, + redirected=redirected, + error=None, + ) + except httpx.HTTPError as exc: + return LinkCheckResult( + original_url=url, + status_code=None, + final_url=None, + redirected=False, + error=str(exc), + ) + + def check_many(self, urls: Iterable[str]) -> dict[str, LinkCheckResult]: + results: dict[str, LinkCheckResult] = {} + for url in unique_preserve_order(urls): + results[url] = self.check(url) + return results diff --git a/mdlink/cli.py b/mdlink/cli.py new file mode 100644 index 0000000..42f404e --- /dev/null +++ b/mdlink/cli.py @@ -0,0 +1,127 @@ +from __future__ import annotations + +import argparse +from collections import defaultdict +from pathlib import Path + +from rich.console import Console +from rich.table import Table + +from .ast_editor import ASTMarkdownEditor +from .checker import LinkChecker +from .models import LinkCheckResult, LinkRecord +from .scanner import MarkdownScanner +from .utils import unique_preserve_order + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(prog="mdlink", description="Scan Markdown files and validate links.") + parser.add_argument("path", type=Path, help="Directory or Markdown file to scan") + parser.add_argument("--timeout", type=float, default=10.0, help="Request timeout in seconds") + return parser.parse_args() + + +def _build_report_table(records: list[LinkRecord], checks: dict[str, LinkCheckResult]) -> Table: + table = Table(title="Non-200 Links") + table.add_column("file") + table.add_column("line", justify="right") + table.add_column("original URL") + table.add_column("status") + table.add_column("final URL") + + for record in records: + result = checks[record.url] + if not result.should_report: + continue + status_value = str(result.status_code) if result.status_code is not None else f"ERR: {result.error}" + table.add_row( + str(record.file_path), + str(record.line), + record.url, + 
status_value, + result.final_url or "", + ) + return table + + +def _collect_redirects(records: list[LinkRecord], checks: dict[str, LinkCheckResult]) -> list[tuple[LinkRecord, LinkCheckResult]]: + redirects: list[tuple[LinkRecord, LinkCheckResult]] = [] + for record in records: + result = checks[record.url] + if not result.redirected: + continue + if not result.final_url: + continue + redirects.append((record, result)) + return redirects + + +def _handle_rewrites( + redirects: list[tuple[LinkRecord, LinkCheckResult]], + checker: LinkChecker, + editor: ASTMarkdownEditor, + console: Console, +) -> None: + replacements_by_file: dict[Path, dict[str, str]] = defaultdict(dict) + seen_pairs: set[tuple[Path, str, str]] = set() + + for record, result in redirects: + if record.kind != "markdown": + continue + final_url = result.final_url + if not final_url: + continue + pair = (record.file_path, record.url, final_url) + if pair in seen_pairs: + continue + seen_pairs.add(pair) + + console.print(f"\n[cyan]{record.file_path}:{record.line}[/cyan]") + console.print(f"[yellow]{record.url}[/yellow] -> [green]{final_url}[/green]") + answer = console.input("Replace old URL with final URL? 
[y/N] ").strip().lower() + if answer != "y": + continue + + verification = checker.check(final_url) + if verification.status_code != 200: + console.print( + f"[red]Skip:[/red] final URL no longer valid ({verification.status_code or verification.error})" + ) + continue + replacements_by_file[record.file_path][record.url] = final_url + + for file_path, replacements in replacements_by_file.items(): + content = file_path.read_text(encoding="utf-8") + updated = editor.replace_links(content, replacements) + if updated != content: + file_path.write_text(updated, encoding="utf-8") + console.print(f"[green]Updated[/green] {file_path}") + + +def main() -> None: + args = parse_args() + console = Console() + + scanner = MarkdownScanner() + records = scanner.scan_path(args.path) + if not records: + console.print("No links found.") + return + + urls = unique_preserve_order(record.url for record in records) + with LinkChecker(timeout=args.timeout) as checker: + checks = checker.check_many(urls) + table = _build_report_table(records, checks) + if table.row_count: + console.print(table) + else: + console.print("No non-200 links found.") + + redirects = _collect_redirects(records, checks) + if redirects: + editor = ASTMarkdownEditor() + _handle_rewrites(redirects=redirects, checker=checker, editor=editor, console=console) + + +if __name__ == "__main__": + main() diff --git a/mdlink/models.py b/mdlink/models.py new file mode 100644 index 0000000..dac484b --- /dev/null +++ b/mdlink/models.py @@ -0,0 +1,28 @@ +from __future__ import annotations + +from dataclasses import dataclass +from pathlib import Path +from typing import Optional + + +@dataclass(frozen=True) +class LinkRecord: + file_path: Path + line: int + url: str + kind: str + + +@dataclass(frozen=True) +class LinkCheckResult: + original_url: str + status_code: Optional[int] + final_url: Optional[str] + redirected: bool + error: Optional[str] = None + + @property + def should_report(self) -> bool: + if self.error is not None: 
+ return True + return self.status_code != 200 diff --git a/mdlink/scanner.py b/mdlink/scanner.py new file mode 100644 index 0000000..55f59e1 --- /dev/null +++ b/mdlink/scanner.py @@ -0,0 +1,63 @@ +from __future__ import annotations + +import re +from pathlib import Path +from typing import Iterable + +from markdown_it import MarkdownIt +from markdown_it.token import Token + +from .models import LinkRecord +from .utils import is_http_url, iter_markdown_files + +NAKED_URL_RE = re.compile(r"https?://[^\s<>()\[\]{}\"']+") + + +class MarkdownScanner: + def __init__(self) -> None: + self._md = MarkdownIt("commonmark") + + def scan_path(self, target: Path) -> list[LinkRecord]: + records: list[LinkRecord] = [] + for file_path in iter_markdown_files(target): + content = file_path.read_text(encoding="utf-8") + records.extend(self.scan_content(file_path=file_path, content=content)) + return records + + def scan_content(self, file_path: Path, content: str) -> list[LinkRecord]: + tokens = self._md.parse(content) + found: list[LinkRecord] = [] + for token in tokens: + if token.type != "inline" or not token.children: + continue + line = self._line_from_token(token) + found.extend(self._extract_from_inline(file_path=file_path, line=line, children=token.children)) + return found + + def _extract_from_inline(self, file_path: Path, line: int, children: Iterable[Token]) -> list[LinkRecord]: + records: list[LinkRecord] = [] + in_link_depth = 0 + for child in children: + if child.type == "link_open": + in_link_depth += 1 + href = child.attrGet("href") + if href and is_http_url(href): + records.append(LinkRecord(file_path=file_path, line=line, url=href, kind="markdown")) + continue + if child.type == "link_close": + in_link_depth = max(0, in_link_depth - 1) + continue + if child.type == "image": + continue + if child.type == "text" and in_link_depth == 0: + for match in NAKED_URL_RE.finditer(child.content): + url = match.group(0).rstrip(".,;:!?") + if is_http_url(url): + 
records.append(LinkRecord(file_path=file_path, line=line, url=url, kind="naked")) + return records + + @staticmethod + def _line_from_token(token: Token) -> int: + if token.map and token.map[0] is not None: + return int(token.map[0]) + 1 + return 1 diff --git a/mdlink/utils.py b/mdlink/utils.py new file mode 100644 index 0000000..8c6b11d --- /dev/null +++ b/mdlink/utils.py @@ -0,0 +1,34 @@ +from __future__ import annotations + +from pathlib import Path +from typing import Iterable, Iterator +from urllib.parse import urlparse + + +MARKDOWN_EXTENSIONS = {".md"} + + +def iter_markdown_files(target: Path) -> Iterator[Path]: + if target.is_file(): + if target.suffix.lower() in MARKDOWN_EXTENSIONS: + yield target + return + for path in sorted(target.rglob("*.md")): + if path.is_file(): + yield path + + +def is_http_url(url: str) -> bool: + parsed = urlparse(url) + return parsed.scheme in {"http", "https"} and bool(parsed.netloc) + + +def unique_preserve_order(values: Iterable[str]) -> list[str]: + seen: set[str] = set() + result: list[str] = [] + for value in values: + if value in seen: + continue + seen.add(value) + result.append(value) + return result diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..9cc18a8 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,21 @@ +[project] +name = "mdlink" +version = "0.1.0" +description = "CLI tool to validate Markdown links and optionally rewrite redirects." 
+readme = "README.md" +authors = [ + {name = "drg", email = "gammlaa@chaospott.de"} +] +license = "MIT" +requires-python = ">=3.9" +dependencies = [ + "httpx>=0.27.0", + "markdown-it-py>=3.0.0", + "rich>=13.7.0" +] + +[project.scripts] +mdlink = "mdlink.cli:main" + +[tool.setuptools.packages.find] +include = ["mdlink*"] diff --git a/test.md b/test.md new file mode 100644 index 0000000..8711be4 --- /dev/null +++ b/test.md @@ -0,0 +1,22 @@ +# mdlink Test + +- [ok](https://httpbin.org/status/200) +- [redirect](https://github.com/) +- [broken](https://httpbin.org/status/404) +- Naked: https://httpbin.org/status/500 +- ![img](https://httpbin.org/image/png) + +## Additional Cases + +- [redirect with title](https://github.com/ "GitHub redirect") +- [query and fragment](https://example.com/docs?lang=de#intro) +- [duplicate redirect](https://github.com/) +- [duplicate redirect again](https://github.com/) +- [non-http scheme should be ignored](mailto:team@example.org) +- [ftp should be ignored](ftp://speedtest.tele2.net) +- [inline code URL should not be a markdown link](`https://example.org/code`) +- [image in text should be ignored] text before ![logo](www.uph.de) text after +- [parentheses in URL](https://en.wikipedia.org/wiki/Function_(mathematics)) +- [trailing punctuation in sentence] See https://example.org/docs, for details. 
+- autolink angle brackets: <https://example.com>