init mdlink

This commit is contained in:
2026-04-10 15:24:42 +02:00
commit f4fa4fc35e
12 changed files with 697 additions and 0 deletions

43
.gitignore vendored Normal file
View File

@@ -0,0 +1,43 @@
# Python bytecode and caches
__pycache__/
*.py[cod]
*$py.class
# Virtual environments
.venv/
venv/
env/
ENV/
# Build and packaging artifacts
build/
dist/
*.egg-info/
.eggs/
pip-wheel-metadata/
*.whl
# Test and coverage outputs
.pytest_cache/
.mypy_cache/
.ruff_cache/
.coverage
.coverage.*
htmlcov/
.tox/
.nox/
# Local tooling/runtime files
.DS_Store
Thumbs.db
*.log
.codex
# IDE/editor settings
.idea/
.vscode/
*.swp
*.swo
# Local test artifacts
test_preview.md

52
README.md Normal file
View File

@@ -0,0 +1,52 @@
# mdlink
`mdlink` is a CLI tool to recursively scan Markdown files and validate HTTP/HTTPS links.
## Features
- Scans `.md` files recursively in a directory (or a single `.md` file).
- Extracts Markdown links (`[text](url)`) and naked URLs (`https://...`).
- Ignores image links (`![alt](url)`).
- Checks links with `httpx` and follows redirects automatically.
- Ignores `200 OK` in the report; shows non-200 and request errors.
- Shows final resolved URL in output.
- Asks interactively before rewriting redirected Markdown links.
- Re-checks target URL before writing (must still return `200`).
- Rewrites links via Markdown AST editing (`markdown-it-py`), not string replacement.
## Installation
```bash
python3 -m venv .venv
source .venv/bin/activate
pip install .
```
## Usage
```bash
mdlink /path/to/docs
```
## Options
- `--timeout FLOAT`
Per-request timeout in seconds (default: `10.0`).
## Interactive Redirect Rewrite
When a Markdown link redirects, `mdlink` prompts:
```text
Replace old URL with final URL? [y/N]
```
For example, a failing link such as `[broken](https://httpbin.org/status/404)` is only reported — it is never rewritten.
Only confirmed links are updated.
## Help
```bash
mdlink --help
```

5
mdlink/__init__.py Normal file
View File

@@ -0,0 +1,5 @@
"""mdlink package."""
from .cli import main
__all__ = ["main"]

217
mdlink/ast_editor.py Normal file
View File

@@ -0,0 +1,217 @@
from __future__ import annotations
from collections import defaultdict
from dataclasses import dataclass
from typing import DefaultDict
from markdown_it import MarkdownIt
from markdown_it.token import Token
@dataclass(frozen=True)
class _LinePatch:
    """One URL substitution (old -> new) to apply within a single source line."""

    old: str  # URL currently present in the Markdown source
    new: str  # replacement URL to splice in
class ASTMarkdownEditor:
    """Rewrites Markdown inline-link destinations in place.

    markdown-it is used only to locate which source lines contain links
    that need replacing; the actual substitution is performed by a small
    character-level scanner over each affected line so that surrounding
    text, link titles, and inline code spans are left untouched.
    """

    def __init__(self) -> None:
        # CommonMark-only parser; used solely to locate links, never to render.
        self._md = MarkdownIt("commonmark")

    def replace_links(self, content: str, replacements: dict[str, str]) -> str:
        """Return *content* with each old-URL -> new-URL mapping applied.

        Only inline Markdown links (``[text](url)``) are rewritten; the
        input is returned unchanged when there is nothing to replace.
        """
        if not replacements:
            return content
        tokens = self._md.parse(content)
        patches_by_line = self._collect_line_patches(tokens=tokens, replacements=replacements)
        if not patches_by_line:
            return content
        # keepends=True so rejoining preserves the original line endings.
        lines = content.splitlines(keepends=True)
        for line_index, patches in patches_by_line.items():
            if line_index < 0 or line_index >= len(lines):
                continue
            lines[line_index] = self._rewrite_markdown_links_in_line(lines[line_index], patches)
        return "".join(lines)

    def _collect_line_patches(
        self,
        tokens: list[Token],
        replacements: dict[str, str],
    ) -> dict[int, list[_LinePatch]]:
        """Map 0-based source line index -> ordered patches for that line.

        NOTE(review): ``token.map[0]`` is the first source line of the whole
        inline token, so a link on a continuation line of a multi-line
        paragraph is attributed to the paragraph's first line and will not
        be matched by the line rewriter — confirm this is acceptable.
        """
        patches_by_line: DefaultDict[int, list[_LinePatch]] = defaultdict(list)
        for token in tokens:
            if token.type != "inline" or not token.children:
                continue
            if not token.map:
                continue
            line_index = int(token.map[0])
            for child in token.children:
                if child.type != "link_open":
                    continue
                href = child.attrGet("href")
                if not href:
                    continue
                new_href = replacements.get(href)
                # Skip links with no mapping, or a no-op mapping.
                if not new_href or new_href == href:
                    continue
                patches_by_line[line_index].append(_LinePatch(old=href, new=new_href))
        return dict(patches_by_line)

    def _rewrite_markdown_links_in_line(self, line: str, patches: list[_LinePatch]) -> str:
        """Apply *patches* (in order) to the Markdown links found in *line*.

        Scans character by character, skipping inline code spans and image
        links, consuming one patch per successfully matched link.
        """
        if not patches:
            return line
        patch_index = 0
        chars = list(line)
        i = 0
        in_code = False
        code_ticks = 0  # backtick-run length that opened the current code span
        while i < len(chars) and patch_index < len(patches):
            char = chars[i]
            if char == "`" and not self._is_escaped(chars, i):
                # Toggle code-span state; a span closes only on a run of the
                # same length as the one that opened it (CommonMark rule).
                run = self._count_char_run(chars, i, "`")
                if not in_code:
                    in_code = True
                    code_ticks = run
                elif run == code_ticks:
                    in_code = False
                    code_ticks = 0
                i += run
                continue
            if in_code:
                i += 1
                continue
            if char == "[" and not self._is_escaped(chars, i):
                if i > 0 and chars[i - 1] == "!":
                    # Image link (![alt](url)) — never rewritten.
                    i += 1
                    continue
                parsed = self._parse_inline_link(chars, i)
                if parsed is None:
                    i += 1
                    continue
                start_url, end_url, parsed_url, close_index = parsed
                patch = patches[patch_index]
                if parsed_url == patch.old:
                    # Splice in the replacement, then shift the closing paren
                    # index by the length delta before resuming the scan.
                    replacement = list(patch.new)
                    chars[start_url:end_url] = replacement
                    delta = len(replacement) - (end_url - start_url)
                    close_index += delta
                    patch_index += 1
                    i = close_index + 1
                    continue
            i += 1
        return "".join(chars)

    def _parse_inline_link(self, chars: list[str], open_bracket: int) -> tuple[int, int, str, int] | None:
        """Parse ``[text](dest ...)`` starting at *open_bracket*.

        Returns ``(url_start, url_end, url, close_paren_index)``, or None
        when the text at this position is not a well-formed inline link.
        """
        close_bracket = self._find_link_text_end(chars, open_bracket)
        if close_bracket is None:
            return None
        cursor = close_bracket + 1
        # Allow whitespace between ']' and '('.
        while cursor < len(chars) and chars[cursor] in (" ", "\t"):
            cursor += 1
        if cursor >= len(chars) or chars[cursor] != "(":
            return None
        close_paren = self._find_matching_paren(chars, cursor)
        if close_paren is None:
            return None
        dest_start = cursor + 1
        # Skip leading whitespace inside the parentheses.
        while dest_start < close_paren and chars[dest_start] in (" ", "\t"):
            dest_start += 1
        if dest_start >= close_paren:
            return None
        if chars[dest_start] == "<":
            # Angle-bracketed destination: <url>.
            dest_end = dest_start + 1
            while dest_end < close_paren and chars[dest_end] != ">":
                dest_end += 1
            if dest_end >= close_paren:
                return None
            url_start = dest_start + 1
            url_end = dest_end
        else:
            url_start = dest_start
            url_end = self._scan_destination_end(chars, start=dest_start, stop=close_paren)
        if url_end <= url_start:
            return None
        parsed_url = "".join(chars[url_start:url_end])
        return url_start, url_end, parsed_url, close_paren

    def _find_link_text_end(self, chars: list[str], open_bracket: int) -> int | None:
        """Return the index of the ']' matching *open_bracket*, honoring
        nested brackets and backslash escapes; None when unclosed."""
        depth = 1
        index = open_bracket + 1
        while index < len(chars):
            char = chars[index]
            if char == "[" and not self._is_escaped(chars, index):
                depth += 1
            elif char == "]" and not self._is_escaped(chars, index):
                depth -= 1
                if depth == 0:
                    return index
            index += 1
        return None

    def _find_matching_paren(self, chars: list[str], open_paren: int) -> int | None:
        """Return the index of the ')' matching *open_paren*.

        Quoted spans (link titles) are skipped so parentheses inside a
        title cannot unbalance the match; None when unclosed.
        """
        depth = 1
        index = open_paren + 1
        in_quote: str | None = None
        while index < len(chars):
            char = chars[index]
            if in_quote is not None:
                if char == in_quote and not self._is_escaped(chars, index):
                    in_quote = None
                index += 1
                continue
            if char in ('"', "'") and not self._is_escaped(chars, index):
                in_quote = char
                index += 1
                continue
            if char == "(" and not self._is_escaped(chars, index):
                depth += 1
            elif char == ")" and not self._is_escaped(chars, index):
                depth -= 1
                if depth == 0:
                    return index
            index += 1
        return None

    def _scan_destination_end(self, chars: list[str], start: int, stop: int) -> int:
        """Return the end index of a bare link destination: stops at top-level
        whitespace or an unbalanced ')', while allowing balanced parentheses
        inside the URL (e.g. Wikipedia-style links)."""
        depth = 0
        index = start
        while index < stop:
            char = chars[index]
            if char in (" ", "\t"):
                if depth == 0:
                    break
            elif char == "(" and not self._is_escaped(chars, index):
                depth += 1
            elif char == ")" and not self._is_escaped(chars, index):
                if depth == 0:
                    break
                depth -= 1
            index += 1
        return index

    def _is_escaped(self, chars: list[str], pos: int) -> bool:
        """True when the character at *pos* is preceded by an odd number of
        backslashes (i.e. backslash-escaped)."""
        backslashes = 0
        index = pos - 1
        while index >= 0 and chars[index] == "\\":
            backslashes += 1
            index -= 1
        return (backslashes % 2) == 1

    def _count_char_run(self, chars: list[str], start: int, char: str) -> int:
        """Length of the run of *char* beginning at *start*."""
        end = start
        while end < len(chars) and chars[end] == char:
            end += 1
        return end - start

54
mdlink/checker.py Normal file
View File

@@ -0,0 +1,54 @@
from __future__ import annotations
from typing import Iterable
import httpx
from .models import LinkCheckResult
from .utils import unique_preserve_order
class LinkChecker:
    """HTTP link checker backed by a shared ``httpx`` client.

    Redirects are followed automatically and the final resolved URL is
    recorded on every result. Usable as a context manager.
    """

    def __init__(self, timeout: float = 10.0) -> None:
        self._timeout = timeout
        self._client = httpx.Client(
            follow_redirects=True,
            timeout=httpx.Timeout(timeout),
            headers={"User-Agent": "mdlink/0.1"},
        )

    def close(self) -> None:
        """Release the underlying HTTP client."""
        self._client.close()

    def __enter__(self) -> "LinkChecker":
        return self

    def __exit__(self, exc_type, exc, tb) -> None:
        self.close()

    def check(self, url: str) -> LinkCheckResult:
        """Fetch *url* and summarize the outcome as a LinkCheckResult.

        Transport-level failures are captured in ``error`` instead of
        propagating.
        """
        try:
            response = self._client.get(url)
        except httpx.HTTPError as exc:
            return LinkCheckResult(
                original_url=url,
                status_code=None,
                final_url=None,
                redirected=False,
                error=str(exc),
            )
        resolved = str(response.url)
        return LinkCheckResult(
            original_url=url,
            status_code=response.status_code,
            final_url=resolved,
            redirected=resolved != url,
            error=None,
        )

    def check_many(self, urls: Iterable[str]) -> dict[str, LinkCheckResult]:
        """Check each distinct URL once, preserving first-seen order."""
        return {url: self.check(url) for url in unique_preserve_order(urls)}

127
mdlink/cli.py Normal file
View File

@@ -0,0 +1,127 @@
from __future__ import annotations
import argparse
from collections import defaultdict
from pathlib import Path
from rich.console import Console
from rich.table import Table
from .ast_editor import ASTMarkdownEditor
from .checker import LinkChecker
from .models import LinkCheckResult, LinkRecord
from .scanner import MarkdownScanner
from .utils import unique_preserve_order
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse command-line arguments.

    Args:
        argv: Argument list to parse. ``None`` (the default) falls back to
            ``sys.argv[1:]``, preserving the original call signature while
            making the parser testable.

    Returns:
        Namespace with ``path`` (Path) and ``timeout`` (float) attributes.
    """
    parser = argparse.ArgumentParser(prog="mdlink", description="Scan Markdown files and validate links.")
    parser.add_argument("path", type=Path, help="Directory or Markdown file to scan")
    parser.add_argument("--timeout", type=float, default=10.0, help="Request timeout in seconds")
    return parser.parse_args(argv)
def _build_report_table(records: list[LinkRecord], checks: dict[str, LinkCheckResult]) -> Table:
    """Render every reportable (non-200 or errored) link check as a rich Table."""
    report = Table(title="Non-200 Links")
    report.add_column("file")
    report.add_column("line", justify="right")
    report.add_column("original URL")
    report.add_column("status")
    report.add_column("final URL")
    for record in records:
        outcome = checks[record.url]
        if not outcome.should_report:
            continue
        # Show the HTTP status when we got one, otherwise the transport error.
        if outcome.status_code is None:
            status = f"ERR: {outcome.error}"
        else:
            status = str(outcome.status_code)
        report.add_row(
            str(record.file_path),
            str(record.line),
            record.url,
            status,
            outcome.final_url or "",
        )
    return report
def _collect_redirects(records: list[LinkRecord], checks: dict[str, LinkCheckResult]) -> list[tuple[LinkRecord, LinkCheckResult]]:
redirects: list[tuple[LinkRecord, LinkCheckResult]] = []
for record in records:
result = checks[record.url]
if not result.redirected:
continue
if not result.final_url:
continue
redirects.append((record, result))
return redirects
def _handle_rewrites(
    redirects: list[tuple[LinkRecord, LinkCheckResult]],
    checker: LinkChecker,
    editor: ASTMarkdownEditor,
    console: Console,
) -> None:
    """Interactively confirm and apply redirect rewrites for markdown links.

    For each redirected markdown link the user is prompted once per unique
    (file, old URL, final URL) triple; on confirmation the final URL is
    re-verified (must still answer 200) before the rewrite is queued. All
    confirmed replacements for a file are applied in a single AST edit.
    """
    pending: dict[Path, dict[str, str]] = defaultdict(dict)
    asked: set[tuple[Path, str, str]] = set()
    for record, result in redirects:
        final_url = result.final_url
        # Naked URLs are report-only; only [text](url) links are rewritten.
        if record.kind != "markdown" or not final_url:
            continue
        key = (record.file_path, record.url, final_url)
        if key in asked:
            continue
        asked.add(key)
        console.print(f"\n[cyan]{record.file_path}:{record.line}[/cyan]")
        console.print(f"[yellow]{record.url}[/yellow] -> [green]{final_url}[/green]")
        if console.input("Replace old URL with final URL? [y/N] ").strip().lower() != "y":
            continue
        verification = checker.check(final_url)
        if verification.status_code != 200:
            console.print(
                f"[red]Skip:[/red] final URL no longer valid ({verification.status_code or verification.error})"
            )
            continue
        pending[record.file_path][record.url] = final_url
    for file_path, replacements in pending.items():
        original = file_path.read_text(encoding="utf-8")
        rewritten = editor.replace_links(original, replacements)
        if rewritten != original:
            file_path.write_text(rewritten, encoding="utf-8")
            console.print(f"[green]Updated[/green] {file_path}")
def main() -> None:
    """CLI entry point: scan Markdown, check links, report, offer rewrites."""
    args = parse_args()
    console = Console()
    records = MarkdownScanner().scan_path(args.path)
    if not records:
        console.print("No links found.")
        return
    with LinkChecker(timeout=args.timeout) as checker:
        checks = checker.check_many(unique_preserve_order(record.url for record in records))
        table = _build_report_table(records, checks)
        if table.row_count:
            console.print(table)
        else:
            console.print("No non-200 links found.")
        redirects = _collect_redirects(records, checks)
        if redirects:
            _handle_rewrites(
                redirects=redirects,
                checker=checker,
                editor=ASTMarkdownEditor(),
                console=console,
            )


if __name__ == "__main__":
    main()

28
mdlink/models.py Normal file
View File

@@ -0,0 +1,28 @@
from __future__ import annotations
from dataclasses import dataclass
from pathlib import Path
from typing import Optional
@dataclass(frozen=True)
class LinkRecord:
    """A single link occurrence found in a Markdown file."""

    file_path: Path  # file the link was found in
    line: int  # 1-based line of the enclosing inline block (paragraph start)
    url: str  # link target exactly as written in the source
    kind: str  # "markdown" for [text](url) links, "naked" for bare URLs
@dataclass(frozen=True)
class LinkCheckResult:
    """Outcome of checking a single URL over HTTP."""

    original_url: str  # URL as found in the Markdown source
    status_code: Optional[int]  # final HTTP status; None on transport error
    final_url: Optional[str]  # URL after following redirects; None on error
    redirected: bool  # True when final_url differs from original_url
    error: Optional[str] = None  # transport error message, if any

    @property
    def should_report(self) -> bool:
        """True when the check errored or finished with a non-200 status."""
        return self.error is not None or self.status_code != 200

63
mdlink/scanner.py Normal file
View File

@@ -0,0 +1,63 @@
from __future__ import annotations
import re
from pathlib import Path
from typing import Iterable
from markdown_it import MarkdownIt
from markdown_it.token import Token
from .models import LinkRecord
from .utils import is_http_url, iter_markdown_files
NAKED_URL_RE = re.compile(r"https?://[^\s<>()\[\]{}\"']+")
class MarkdownScanner:
    """Extracts HTTP/HTTPS links from Markdown files via markdown-it tokens."""

    def __init__(self) -> None:
        self._md = MarkdownIt("commonmark")

    def scan_path(self, target: Path) -> list[LinkRecord]:
        """Scan *target* (a file or a directory tree) and return all links found."""
        found: list[LinkRecord] = []
        for markdown_file in iter_markdown_files(target):
            text = markdown_file.read_text(encoding="utf-8")
            found.extend(self.scan_content(file_path=markdown_file, content=text))
        return found

    def scan_content(self, file_path: Path, content: str) -> list[LinkRecord]:
        """Parse *content* and collect link records from every inline token."""
        records: list[LinkRecord] = []
        for token in self._md.parse(content):
            if token.type == "inline" and token.children:
                records.extend(
                    self._extract_from_inline(
                        file_path=file_path,
                        line=self._line_from_token(token),
                        children=token.children,
                    )
                )
        return records

    def _extract_from_inline(self, file_path: Path, line: int, children: Iterable[Token]) -> list[LinkRecord]:
        """Walk inline children: record markdown links, and naked URLs only
        when outside any link. Image tokens fall through untouched."""
        collected: list[LinkRecord] = []
        link_depth = 0
        for child in children:
            token_type = child.type
            if token_type == "link_open":
                link_depth += 1
                href = child.attrGet("href")
                if href and is_http_url(href):
                    collected.append(LinkRecord(file_path=file_path, line=line, url=href, kind="markdown"))
            elif token_type == "link_close":
                link_depth = max(0, link_depth - 1)
            elif token_type == "text" and link_depth == 0:
                for match in NAKED_URL_RE.finditer(child.content):
                    # Strip trailing sentence punctuation from bare URLs.
                    candidate = match.group(0).rstrip(".,;:!?")
                    if is_http_url(candidate):
                        collected.append(LinkRecord(file_path=file_path, line=line, url=candidate, kind="naked"))
        return collected

    @staticmethod
    def _line_from_token(token: Token) -> int:
        """Best-effort 1-based line: the inline token's first mapped source line."""
        if token.map and token.map[0] is not None:
            return int(token.map[0]) + 1
        return 1

34
mdlink/utils.py Normal file
View File

@@ -0,0 +1,34 @@
from __future__ import annotations
from pathlib import Path
from typing import Iterable, Iterator
from urllib.parse import urlparse
# File extensions treated as Markdown.
MARKDOWN_EXTENSIONS = {".md"}


def iter_markdown_files(target: Path) -> Iterator[Path]:
    """Yield Markdown files under *target*.

    A single file is yielded only when it carries a Markdown extension
    (case-insensitive); a directory is walked recursively, yielding
    regular ``*.md`` files in sorted order.
    """
    if target.is_file():
        if target.suffix.lower() in MARKDOWN_EXTENSIONS:
            yield target
        return
    yield from (candidate for candidate in sorted(target.rglob("*.md")) if candidate.is_file())
def is_http_url(url: str) -> bool:
    """Return True for absolute ``http://`` or ``https://`` URLs with a host."""
    parts = urlparse(url)
    if parts.scheme not in ("http", "https"):
        return False
    return bool(parts.netloc)
def unique_preserve_order(values: Iterable[str]) -> list[str]:
    """Return *values* with duplicates removed, keeping first-seen order.

    Uses dict key uniqueness (dicts are insertion-ordered since Python 3.7)
    instead of a hand-rolled seen-set loop; same O(n) behavior, less code.
    """
    return list(dict.fromkeys(values))

21
pyproject.toml Normal file
View File

@@ -0,0 +1,21 @@
[project]
name = "mdlink"
version = "0.1.0"
description = "CLI tool to validate Markdown links and optionally rewrite redirects."
readme = "README.md"
authors = [
{name = "drg", email = "gammlaa@chaospott.de"}
]
license = "MIT"
requires-python = ">=3.9"
dependencies = [
"httpx>=0.27.0",
"markdown-it-py>=3.0.0",
"rich>=13.7.0"
]
[project.scripts]
mdlink = "mdlink.cli:main"
[tool.setuptools.packages.find]
include = ["mdlink*"]

22
test.md Normal file
View File

@@ -0,0 +1,22 @@
# mdlink Test
- [ok](https://httpbin.org/status/200)
- [redirect](https://github.com/)
- [broken](https://httpbin.org/status/404)
- Naked: https://httpbin.org/status/500
- ![img](https://httpbin.org/image/png)
## Additional Cases
- [redirect with title](https://github.com/ "GitHub redirect")
- [query and fragment](https://example.com/docs?lang=de#intro)
- [duplicate redirect](https://github.com/)
- [duplicate redirect again](https://github.com/)
- [non-http scheme should be ignored](mailto:team@example.org)
- [ftp should be ignored](ftp://speedtest.tele2.net)
- [inline code URL should not be a markdown link](`https://example.org/code`)
- [image in text should be ignored] text before ![logo](www.uph.de) text after
- [parentheses in URL](https://en.wikipedia.org/wiki/Function_(mathematics))
- [trailing punctuation in sentence] See https://example.org/docs, for details.
- autolink angle brackets: <https://example.org/autolink>
- bare www should be ignored: www.example.org

31
test.sh Executable file
View File

@@ -0,0 +1,31 @@
#!/usr/bin/env bash
# Generate a Markdown fixture exercising mdlink's link-detection cases:
# ok/redirect/broken links, naked URLs, images, non-http schemes,
# inline-code URLs, parenthesized URLs, and autolinks.
set -euo pipefail
# Destination path; defaults to test.md when no argument is supplied.
OUTPUT_FILE="${1:-test.md}"
# Quoted heredoc delimiter ('EOF') writes the content verbatim — no
# variable expansion, so URLs and backticks survive untouched.
cat > "${OUTPUT_FILE}" <<'EOF'
# mdlink Test
- [ok](https://httpbin.org/status/200)
- [redirect](http://github.com)
- [broken](https://httpbin.org/status/404)
- Naked: https://httpbin.org/status/500
- ![img](https://httpbin.org/image/png)
## Additional Cases
- [redirect with title](http://github.com "GitHub redirect")
- [query and fragment](https://example.com/docs?lang=de#intro)
- [duplicate redirect](http://github.com)
- [duplicate redirect again](http://github.com)
- [non-http scheme should be ignored](mailto:team@example.org)
- [ftp should be ignored](ftp://speedtest.tele2.net)
- [inline code URL should not be a markdown link](`https://example.org/code`)
- [image in text should be ignored] text before ![logo](https://example.com/logo.png) text after
- [parentheses in URL](https://en.wikipedia.org/wiki/Function_(mathematics))
- [trailing punctuation in sentence] See https://example.org/docs, for details.
- autolink angle brackets: <https://example.org/autolink>
- bare www should be ignored: www.example.org
EOF
echo "Generated ${OUTPUT_FILE}"