init mdlink
This commit is contained in:
43
.gitignore
vendored
Normal file
43
.gitignore
vendored
Normal file
@@ -0,0 +1,43 @@
|
|||||||
|
# Python bytecode and caches
|
||||||
|
__pycache__/
|
||||||
|
*.py[cod]
|
||||||
|
*$py.class
|
||||||
|
|
||||||
|
# Virtual environments
|
||||||
|
.venv/
|
||||||
|
venv/
|
||||||
|
env/
|
||||||
|
ENV/
|
||||||
|
|
||||||
|
# Build and packaging artifacts
|
||||||
|
build/
|
||||||
|
dist/
|
||||||
|
*.egg-info/
|
||||||
|
.eggs/
|
||||||
|
pip-wheel-metadata/
|
||||||
|
*.whl
|
||||||
|
|
||||||
|
# Test and coverage outputs
|
||||||
|
.pytest_cache/
|
||||||
|
.mypy_cache/
|
||||||
|
.ruff_cache/
|
||||||
|
.coverage
|
||||||
|
.coverage.*
|
||||||
|
htmlcov/
|
||||||
|
.tox/
|
||||||
|
.nox/
|
||||||
|
|
||||||
|
# Local tooling/runtime files
|
||||||
|
.DS_Store
|
||||||
|
Thumbs.db
|
||||||
|
*.log
|
||||||
|
.codex
|
||||||
|
|
||||||
|
# IDE/editor settings
|
||||||
|
.idea/
|
||||||
|
.vscode/
|
||||||
|
*.swp
|
||||||
|
*.swo
|
||||||
|
|
||||||
|
# Local test artifacts
|
||||||
|
test_preview.md
|
||||||
52
README.md
Normal file
52
README.md
Normal file
@@ -0,0 +1,52 @@
|
|||||||
|
# mdlink
|
||||||
|
|
||||||
|
`mdlink` is a CLI tool to recursively scan Markdown files and validate HTTP/HTTPS links.
|
||||||
|
|
||||||
|
## Features
|
||||||
|
|
||||||
|
- Scans `.md` files recursively in a directory (or a single `.md` file).
|
||||||
|
- Extracts Markdown links (`[text](url)`) and naked URLs (`https://...`).
|
||||||
|
- Ignores image links (`![alt](url)`).
|
||||||
|
- Checks links with `httpx` and follows redirects automatically.
|
||||||
|
- Ignores `200 OK` in the report; shows non-200 and request errors.
|
||||||
|
- Shows final resolved URL in output.
|
||||||
|
- Asks interactively before rewriting redirected Markdown links.
|
||||||
|
- Re-checks target URL before writing (must still return `200`).
|
||||||
|
- Rewrites links via Markdown AST editing (`markdown-it-py`), not string replacement.
|
||||||
|
|
||||||
|
## Installation
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python3 -m venv .venv
|
||||||
|
source .venv/bin/activate
|
||||||
|
pip install .
|
||||||
|
```
|
||||||
|
|
||||||
|
## Usage
|
||||||
|
|
||||||
|
```bash
|
||||||
|
mdlink /path/to/docs
|
||||||
|
```
|
||||||
|
|
||||||
|
## Options
|
||||||
|
|
||||||
|
- `--timeout FLOAT`
|
||||||
|
Per-request timeout in seconds (default: `10.0`).
|
||||||
|
|
||||||
|
## Interactive Redirect Rewrite
|
||||||
|
|
||||||
|
When a Markdown link redirects, `mdlink` prompts:
|
||||||
|
|
||||||
|
```text
|
||||||
|
Replace old URL with final URL? [y/N]
|
||||||
|
```
|
||||||
|
|
||||||
|
- [broken](https://httpbin.org/status/404)
|
||||||
|
|
||||||
|
Only confirmed links are updated.
|
||||||
|
|
||||||
|
## Help
|
||||||
|
|
||||||
|
```bash
|
||||||
|
mdlink --help
|
||||||
|
```
|
||||||
5
mdlink/__init__.py
Normal file
5
mdlink/__init__.py
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
"""mdlink package."""
|
||||||
|
|
||||||
|
from .cli import main
|
||||||
|
|
||||||
|
__all__ = ["main"]
|
||||||
217
mdlink/ast_editor.py
Normal file
217
mdlink/ast_editor.py
Normal file
@@ -0,0 +1,217 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from collections import defaultdict
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from typing import DefaultDict
|
||||||
|
|
||||||
|
from markdown_it import MarkdownIt
|
||||||
|
from markdown_it.token import Token
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class _LinePatch:
    """One URL substitution to apply within a single source line."""

    # URL text exactly as it currently appears in the line.
    old: str
    # Replacement URL to splice in.
    new: str
|
||||||
|
|
||||||
|
|
||||||
|
class ASTMarkdownEditor:
    """Rewrites inline Markdown link destinations in place.

    The markdown-it AST is used only to locate which source lines contain
    links whose href should change; the actual edit is a character-level
    splice on the original line, so all surrounding formatting is kept
    byte-for-byte.
    """

    def __init__(self) -> None:
        # CommonMark preset — same dialect used by the scanner in this package.
        self._md = MarkdownIt("commonmark")

    def replace_links(self, content: str, replacements: dict[str, str]) -> str:
        """Return *content* with matching link URLs rewritten.

        *replacements* maps old href -> new href. Lines with no matching
        link are returned untouched; newline style is preserved because
        lines are split with ``keepends=True``.
        """
        if not replacements:
            return content

        tokens = self._md.parse(content)
        patches_by_line = self._collect_line_patches(tokens=tokens, replacements=replacements)
        if not patches_by_line:
            return content

        lines = content.splitlines(keepends=True)
        for line_index, patches in patches_by_line.items():
            # Guard against token maps that point outside the real line list.
            if line_index < 0 or line_index >= len(lines):
                continue
            lines[line_index] = self._rewrite_markdown_links_in_line(lines[line_index], patches)
        return "".join(lines)

    def _collect_line_patches(
        self,
        tokens: list[Token],
        replacements: dict[str, str],
    ) -> dict[int, list[_LinePatch]]:
        """Group requested URL substitutions by 0-based source line index.

        Walks inline tokens, collecting a patch for every ``link_open``
        whose href appears in *replacements* with a genuinely different
        target.
        """
        patches_by_line: DefaultDict[int, list[_LinePatch]] = defaultdict(list)
        for token in tokens:
            if token.type != "inline" or not token.children:
                continue
            if not token.map:
                continue
            # NOTE(review): token.map[0] is the first line of the enclosing
            # block; links on later lines of a multi-line paragraph would be
            # attributed to that first line — confirm against inputs used.
            line_index = int(token.map[0])
            for child in token.children:
                if child.type != "link_open":
                    continue
                href = child.attrGet("href")
                if not href:
                    continue
                new_href = replacements.get(href)
                if not new_href or new_href == href:
                    continue
                patches_by_line[line_index].append(_LinePatch(old=href, new=new_href))
        return dict(patches_by_line)

    def _rewrite_markdown_links_in_line(self, line: str, patches: list[_LinePatch]) -> str:
        """Apply *patches* to one line via character-level splicing.

        Skips inline-code spans (backtick-delimited) and image links
        (``![...]``). Patches are consumed strictly in order: each parsed
        link is compared only against the next pending patch, so patch
        order must match link order on the line (which it does when the
        patches come from _collect_line_patches for a single-line block).
        """
        if not patches:
            return line

        patch_index = 0
        chars = list(line)
        i = 0
        in_code = False
        code_ticks = 0  # backtick run length that opened the current code span

        while i < len(chars) and patch_index < len(patches):
            char = chars[i]

            # Track inline code: a code span closes only on a backtick run
            # of the same length that opened it.
            if char == "`" and not self._is_escaped(chars, i):
                run = self._count_char_run(chars, i, "`")
                if not in_code:
                    in_code = True
                    code_ticks = run
                elif run == code_ticks:
                    in_code = False
                    code_ticks = 0
                i += run
                continue

            if in_code:
                i += 1
                continue

            if char == "[" and not self._is_escaped(chars, i):
                # Leading '!' marks an image; images are never rewritten.
                if i > 0 and chars[i - 1] == "!":
                    i += 1
                    continue
                parsed = self._parse_inline_link(chars, i)
                if parsed is None:
                    i += 1
                    continue
                start_url, end_url, parsed_url, close_index = parsed
                patch = patches[patch_index]
                if parsed_url == patch.old:
                    replacement = list(patch.new)
                    chars[start_url:end_url] = replacement
                    # The splice may grow/shrink the line; shift the closing
                    # paren index by the length delta before jumping past it.
                    delta = len(replacement) - (end_url - start_url)
                    close_index += delta
                    patch_index += 1
                    i = close_index + 1
                    continue

            i += 1

        return "".join(chars)

    def _parse_inline_link(self, chars: list[str], open_bracket: int) -> tuple[int, int, str, int] | None:
        """Parse an inline link ``[text](dest)`` starting at *open_bracket*.

        Returns ``(url_start, url_end, url, close_paren_index)`` with
        indices into *chars*, or ``None`` when the text at *open_bracket*
        is not a well-formed inline link. Angle-bracketed destinations
        (``(<...>)``) have the brackets excluded from the URL span.
        """
        close_bracket = self._find_link_text_end(chars, open_bracket)
        if close_bracket is None:
            return None

        # Allow optional whitespace between ']' and '('.
        cursor = close_bracket + 1
        while cursor < len(chars) and chars[cursor] in (" ", "\t"):
            cursor += 1
        if cursor >= len(chars) or chars[cursor] != "(":
            return None

        close_paren = self._find_matching_paren(chars, cursor)
        if close_paren is None:
            return None

        # Skip leading whitespace inside the parentheses.
        dest_start = cursor + 1
        while dest_start < close_paren and chars[dest_start] in (" ", "\t"):
            dest_start += 1
        if dest_start >= close_paren:
            return None

        if chars[dest_start] == "<":
            # Angle-bracketed destination: URL runs up to the matching '>'.
            dest_end = dest_start + 1
            while dest_end < close_paren and chars[dest_end] != ">":
                dest_end += 1
            if dest_end >= close_paren:
                return None
            url_start = dest_start + 1
            url_end = dest_end
        else:
            # Bare destination: URL ends at unbalanced ')' or whitespace
            # (whitespace would begin an optional link title).
            url_start = dest_start
            url_end = self._scan_destination_end(chars, start=dest_start, stop=close_paren)
            if url_end <= url_start:
                return None

        parsed_url = "".join(chars[url_start:url_end])
        return url_start, url_end, parsed_url, close_paren

    def _find_link_text_end(self, chars: list[str], open_bracket: int) -> int | None:
        """Return the index of the ']' matching *open_bracket*, honoring
        nested unescaped brackets; None when unbalanced."""
        depth = 1
        index = open_bracket + 1
        while index < len(chars):
            char = chars[index]
            if char == "[" and not self._is_escaped(chars, index):
                depth += 1
            elif char == "]" and not self._is_escaped(chars, index):
                depth -= 1
                if depth == 0:
                    return index
            index += 1
        return None

    def _find_matching_paren(self, chars: list[str], open_paren: int) -> int | None:
        """Return the index of the ')' matching *open_paren*, or None.

        Parentheses inside single- or double-quoted spans (link titles)
        are ignored, as are escaped parentheses.
        """
        depth = 1
        index = open_paren + 1
        in_quote: str | None = None
        while index < len(chars):
            char = chars[index]
            if in_quote is not None:
                # Inside a quoted title: only an unescaped closing quote matters.
                if char == in_quote and not self._is_escaped(chars, index):
                    in_quote = None
                index += 1
                continue
            if char in ('"', "'") and not self._is_escaped(chars, index):
                in_quote = char
                index += 1
                continue
            if char == "(" and not self._is_escaped(chars, index):
                depth += 1
            elif char == ")" and not self._is_escaped(chars, index):
                depth -= 1
                if depth == 0:
                    return index
            index += 1
        return None

    def _scan_destination_end(self, chars: list[str], start: int, stop: int) -> int:
        """Return the end index (exclusive) of a bare link destination.

        Scans forward from *start*, stopping at top-level whitespace or an
        unbalanced ')'. Balanced parentheses inside the URL (e.g.
        Wikipedia-style paths) are kept as part of the destination.
        """
        depth = 0
        index = start
        while index < stop:
            char = chars[index]
            if char in (" ", "\t"):
                if depth == 0:
                    break
            elif char == "(" and not self._is_escaped(chars, index):
                depth += 1
            elif char == ")" and not self._is_escaped(chars, index):
                if depth == 0:
                    break
                depth -= 1
            index += 1
        return index

    def _is_escaped(self, chars: list[str], pos: int) -> bool:
        """True when the character at *pos* is preceded by an odd number of
        backslashes (i.e. it is backslash-escaped)."""
        backslashes = 0
        index = pos - 1
        while index >= 0 and chars[index] == "\\":
            backslashes += 1
            index -= 1
        return (backslashes % 2) == 1

    def _count_char_run(self, chars: list[str], start: int, char: str) -> int:
        """Length of the run of *char* beginning at *start*."""
        end = start
        while end < len(chars) and chars[end] == char:
            end += 1
        return end - start
|
||||||
54
mdlink/checker.py
Normal file
54
mdlink/checker.py
Normal file
@@ -0,0 +1,54 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from typing import Iterable
|
||||||
|
|
||||||
|
import httpx
|
||||||
|
|
||||||
|
from .models import LinkCheckResult
|
||||||
|
from .utils import unique_preserve_order
|
||||||
|
|
||||||
|
|
||||||
|
class LinkChecker:
    """Validate URLs over HTTP with a shared httpx client.

    Redirects are followed automatically and the final resolved URL is
    reported. Usable as a context manager so the connection pool is
    released deterministically.
    """

    def __init__(self, timeout: float = 10.0) -> None:
        self._timeout = timeout
        self._client = httpx.Client(
            follow_redirects=True,
            timeout=httpx.Timeout(timeout),
            headers={"User-Agent": "mdlink/0.1"},
        )

    def close(self) -> None:
        """Release the underlying HTTP connection pool."""
        self._client.close()

    def __enter__(self) -> "LinkChecker":
        return self

    def __exit__(self, exc_type, exc, tb) -> None:
        self.close()

    def check(self, url: str) -> LinkCheckResult:
        """GET *url* and describe the outcome.

        Transport-level failures (DNS, timeout, TLS, ...) are captured as
        an error string rather than raised.
        """
        try:
            response = self._client.get(url)
        except httpx.HTTPError as exc:
            return LinkCheckResult(
                original_url=url,
                status_code=None,
                final_url=None,
                redirected=False,
                error=str(exc),
            )
        resolved = str(response.url)
        return LinkCheckResult(
            original_url=url,
            status_code=response.status_code,
            final_url=resolved,
            redirected=resolved != url,
            error=None,
        )

    def check_many(self, urls: Iterable[str]) -> dict[str, LinkCheckResult]:
        """Check each distinct URL exactly once, preserving first-seen order."""
        return {url: self.check(url) for url in unique_preserve_order(urls)}
|
||||||
127
mdlink/cli.py
Normal file
127
mdlink/cli.py
Normal file
@@ -0,0 +1,127 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
from collections import defaultdict
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from rich.console import Console
|
||||||
|
from rich.table import Table
|
||||||
|
|
||||||
|
from .ast_editor import ASTMarkdownEditor
|
||||||
|
from .checker import LinkChecker
|
||||||
|
from .models import LinkCheckResult, LinkRecord
|
||||||
|
from .scanner import MarkdownScanner
|
||||||
|
from .utils import unique_preserve_order
|
||||||
|
|
||||||
|
|
||||||
|
def parse_args() -> argparse.Namespace:
    """Parse the mdlink command line: a target path plus request options."""
    arg_parser = argparse.ArgumentParser(
        prog="mdlink",
        description="Scan Markdown files and validate links.",
    )
    arg_parser.add_argument(
        "path",
        type=Path,
        help="Directory or Markdown file to scan",
    )
    arg_parser.add_argument(
        "--timeout",
        type=float,
        default=10.0,
        help="Request timeout in seconds",
    )
    return arg_parser.parse_args()
|
||||||
|
|
||||||
|
|
||||||
|
def _build_report_table(records: list[LinkRecord], checks: dict[str, LinkCheckResult]) -> Table:
    """Build a Rich table listing every link whose check result is reportable.

    Links that resolved with a plain 200 are omitted; request errors show
    the error text in the status column.
    """
    table = Table(title="Non-200 Links")
    table.add_column("file")
    table.add_column("line", justify="right")
    table.add_column("original URL")
    table.add_column("status")
    table.add_column("final URL")

    for record in records:
        outcome = checks[record.url]
        if not outcome.should_report:
            continue
        if outcome.status_code is None:
            status_cell = f"ERR: {outcome.error}"
        else:
            status_cell = str(outcome.status_code)
        table.add_row(
            str(record.file_path),
            str(record.line),
            record.url,
            status_cell,
            outcome.final_url or "",
        )
    return table
|
||||||
|
|
||||||
|
|
||||||
|
def _collect_redirects(records: list[LinkRecord], checks: dict[str, LinkCheckResult]) -> list[tuple[LinkRecord, LinkCheckResult]]:
|
||||||
|
redirects: list[tuple[LinkRecord, LinkCheckResult]] = []
|
||||||
|
for record in records:
|
||||||
|
result = checks[record.url]
|
||||||
|
if not result.redirected:
|
||||||
|
continue
|
||||||
|
if not result.final_url:
|
||||||
|
continue
|
||||||
|
redirects.append((record, result))
|
||||||
|
return redirects
|
||||||
|
|
||||||
|
|
||||||
|
def _handle_rewrites(
    redirects: list[tuple[LinkRecord, LinkCheckResult]],
    checker: LinkChecker,
    editor: ASTMarkdownEditor,
    console: Console,
) -> None:
    """Interactively offer to rewrite redirected Markdown links on disk.

    For each unique (file, old URL, final URL) triple the user is prompted;
    on confirmation the final URL is re-checked (it must still return 200)
    before being queued. All confirmed replacements for a file are applied
    in one read/rewrite/write pass at the end.
    """
    # Replacements queued per file; applied in a single pass per file below.
    replacements_by_file: dict[Path, dict[str, str]] = defaultdict(dict)
    # Dedupe: the same URL may appear multiple times in one file.
    seen_pairs: set[tuple[Path, str, str]] = set()

    for record, result in redirects:
        # Only Markdown-syntax links are rewritten; naked URLs are left alone.
        if record.kind != "markdown":
            continue
        final_url = result.final_url
        if not final_url:
            continue
        pair = (record.file_path, record.url, final_url)
        if pair in seen_pairs:
            continue
        seen_pairs.add(pair)

        console.print(f"\n[cyan]{record.file_path}:{record.line}[/cyan]")
        console.print(f"[yellow]{record.url}[/yellow] -> [green]{final_url}[/green]")
        # Default answer is "no": anything other than exactly 'y' skips.
        answer = console.input("Replace old URL with final URL? [y/N] ").strip().lower()
        if answer != "y":
            continue

        # Re-verify the redirect target right before committing to the edit.
        verification = checker.check(final_url)
        if verification.status_code != 200:
            console.print(
                f"[red]Skip:[/red] final URL no longer valid ({verification.status_code or verification.error})"
            )
            continue
        replacements_by_file[record.file_path][record.url] = final_url

    # Apply all confirmed replacements, writing only files that changed.
    for file_path, replacements in replacements_by_file.items():
        content = file_path.read_text(encoding="utf-8")
        updated = editor.replace_links(content, replacements)
        if updated != content:
            file_path.write_text(updated, encoding="utf-8")
            console.print(f"[green]Updated[/green] {file_path}")
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
    """CLI entry point: scan, check, report, then offer redirect rewrites."""
    args = parse_args()
    console = Console()

    records = MarkdownScanner().scan_path(args.path)
    if not records:
        console.print("No links found.")
        return

    distinct_urls = unique_preserve_order(item.url for item in records)
    with LinkChecker(timeout=args.timeout) as checker:
        checks = checker.check_many(distinct_urls)

        report = _build_report_table(records, checks)
        if report.row_count:
            console.print(report)
        else:
            console.print("No non-200 links found.")

        redirects = _collect_redirects(records, checks)
        if redirects:
            _handle_rewrites(
                redirects=redirects,
                checker=checker,
                editor=ASTMarkdownEditor(),
                console=console,
            )
|
||||||
|
|
||||||
|
|
||||||
|
# Allow direct execution (`python mdlink/cli.py`) in addition to the
# `mdlink` console script declared in pyproject.toml.
if __name__ == "__main__":
    main()
|
||||||
28
mdlink/models.py
Normal file
28
mdlink/models.py
Normal file
@@ -0,0 +1,28 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class LinkRecord:
    """A single link occurrence discovered in a Markdown file."""

    # File the link was found in.
    file_path: Path
    # 1-based source line (best effort; see scanner._line_from_token).
    line: int
    # The link target as written in the document.
    url: str
    # "markdown" for [text](url) links, "naked" for bare URLs in text.
    kind: str
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class LinkCheckResult:
    """Outcome of checking one URL.

    Either the HTTP fields are populated (status_code/final_url) or the
    request failed and *error* carries the transport error text.
    """

    original_url: str
    status_code: Optional[int]
    final_url: Optional[str]
    redirected: bool
    error: Optional[str] = None

    @property
    def should_report(self) -> bool:
        """True when the result belongs in the report: any request error,
        or any status other than a plain 200."""
        return self.error is not None or self.status_code != 200
|
||||||
63
mdlink/scanner.py
Normal file
63
mdlink/scanner.py
Normal file
@@ -0,0 +1,63 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import re
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Iterable
|
||||||
|
|
||||||
|
from markdown_it import MarkdownIt
|
||||||
|
from markdown_it.token import Token
|
||||||
|
|
||||||
|
from .models import LinkRecord
|
||||||
|
from .utils import is_http_url, iter_markdown_files
|
||||||
|
|
||||||
|
NAKED_URL_RE = re.compile(r"https?://[^\s<>()\[\]{}\"']+")
|
||||||
|
|
||||||
|
|
||||||
|
class MarkdownScanner:
    """Extract HTTP(S) link records from Markdown via markdown-it tokens."""

    def __init__(self) -> None:
        self._md = MarkdownIt("commonmark")

    def scan_path(self, target: Path) -> list[LinkRecord]:
        """Scan a file or directory tree and return every discovered link."""
        collected: list[LinkRecord] = []
        for md_file in iter_markdown_files(target):
            text = md_file.read_text(encoding="utf-8")
            collected.extend(self.scan_content(file_path=md_file, content=text))
        return collected

    def scan_content(self, file_path: Path, content: str) -> list[LinkRecord]:
        """Parse *content* and return link records from its inline tokens."""
        found: list[LinkRecord] = []
        for token in self._md.parse(content):
            if token.type != "inline" or not token.children:
                continue
            found.extend(
                self._extract_from_inline(
                    file_path=file_path,
                    line=self._line_from_token(token),
                    children=token.children,
                )
            )
        return found

    def _extract_from_inline(self, file_path: Path, line: int, children: Iterable[Token]) -> list[LinkRecord]:
        """Collect markdown-link hrefs and naked URLs from inline children.

        Naked-URL scanning is suppressed inside link text (link_depth > 0)
        so a URL never gets recorded twice; image tokens match no branch
        and are thereby ignored.
        """
        records: list[LinkRecord] = []
        link_depth = 0
        for child in children:
            token_kind = child.type
            if token_kind == "link_open":
                link_depth += 1
                href = child.attrGet("href")
                if href and is_http_url(href):
                    records.append(LinkRecord(file_path=file_path, line=line, url=href, kind="markdown"))
            elif token_kind == "link_close":
                link_depth = max(0, link_depth - 1)
            elif token_kind == "text" and link_depth == 0:
                for match in NAKED_URL_RE.finditer(child.content):
                    # Strip sentence punctuation that trails a bare URL.
                    candidate = match.group(0).rstrip(".,;:!?")
                    if is_http_url(candidate):
                        records.append(LinkRecord(file_path=file_path, line=line, url=candidate, kind="naked"))
        return records

    @staticmethod
    def _line_from_token(token: Token) -> int:
        """Best-effort 1-based source line for *token*; 1 when unmapped."""
        if token.map and token.map[0] is not None:
            return int(token.map[0]) + 1
        return 1
|
||||||
34
mdlink/utils.py
Normal file
34
mdlink/utils.py
Normal file
@@ -0,0 +1,34 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Iterable, Iterator
|
||||||
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
|
|
||||||
|
MARKDOWN_EXTENSIONS = {".md"}
|
||||||
|
|
||||||
|
|
||||||
|
def iter_markdown_files(target: Path) -> Iterator[Path]:
    """Yield Markdown files under *target* in sorted order.

    A file argument is yielded directly when its (case-insensitive) suffix
    is a Markdown extension; a directory is searched recursively for
    ``*.md`` entries, skipping anything that is not a regular file.
    """
    if target.is_file():
        if target.suffix.lower() in MARKDOWN_EXTENSIONS:
            yield target
        return
    candidates = sorted(target.rglob("*.md"))
    yield from (candidate for candidate in candidates if candidate.is_file())
|
||||||
|
|
||||||
|
|
||||||
|
def is_http_url(url: str) -> bool:
    """Return True when *url* is an absolute http/https URL with a host."""
    parts = urlparse(url)
    if parts.scheme not in ("http", "https"):
        return False
    return bool(parts.netloc)
|
||||||
|
|
||||||
|
|
||||||
|
def unique_preserve_order(values: Iterable[str]) -> list[str]:
    """Return the distinct values in first-seen order.

    Relies on dict preserving insertion order (guaranteed since Python 3.7).
    """
    return list(dict.fromkeys(values))
|
||||||
21
pyproject.toml
Normal file
21
pyproject.toml
Normal file
@@ -0,0 +1,21 @@
|
|||||||
|
[project]
|
||||||
|
name = "mdlink"
|
||||||
|
version = "0.1.0"
|
||||||
|
description = "CLI tool to validate Markdown links and optionally rewrite redirects."
|
||||||
|
readme = "README.md"
|
||||||
|
authors = [
|
||||||
|
{name = "drg", email = "gammlaa@chaospott.de"}
|
||||||
|
]
|
||||||
|
license = "MIT"
|
||||||
|
requires-python = ">=3.9"
|
||||||
|
dependencies = [
|
||||||
|
"httpx>=0.27.0",
|
||||||
|
"markdown-it-py>=3.0.0",
|
||||||
|
"rich>=13.7.0"
|
||||||
|
]
|
||||||
|
|
||||||
|
[project.scripts]
|
||||||
|
mdlink = "mdlink.cli:main"
|
||||||
|
|
||||||
|
[tool.setuptools.packages.find]
|
||||||
|
include = ["mdlink*"]
|
||||||
22
test.md
Normal file
22
test.md
Normal file
@@ -0,0 +1,22 @@
|
|||||||
|
# mdlink Test
|
||||||
|
|
||||||
|
- [ok](https://httpbin.org/status/200)
|
||||||
|
- [redirect](https://github.com/)
|
||||||
|
- [broken](https://httpbin.org/status/404)
|
||||||
|
- Naked: https://httpbin.org/status/500
|
||||||
|
- 
|
||||||
|
|
||||||
|
## Additional Cases
|
||||||
|
|
||||||
|
- [redirect with title](https://github.com/ "GitHub redirect")
|
||||||
|
- [query and fragment](https://example.com/docs?lang=de#intro)
|
||||||
|
- [duplicate redirect](https://github.com/)
|
||||||
|
- [duplicate redirect again](https://github.com/)
|
||||||
|
- [non-http scheme should be ignored](mailto:team@example.org)
|
||||||
|
- [ftp should be ignored](ftp://speedtest.tele2.net)
|
||||||
|
- [inline code URL should not be a markdown link](`https://example.org/code`)
|
||||||
|
- [image in text should be ignored] text before  text after
|
||||||
|
- [parentheses in URL](https://en.wikipedia.org/wiki/Function_(mathematics))
|
||||||
|
- [trailing punctuation in sentence] See https://example.org/docs, for details.
|
||||||
|
- autolink angle brackets: <https://example.org/autolink>
|
||||||
|
- bare www should be ignored: www.example.org
|
||||||
31
test.sh
Executable file
31
test.sh
Executable file
@@ -0,0 +1,31 @@
|
|||||||
|
#!/usr/bin/env bash
# Generate a Markdown fixture exercising mdlink's link-detection cases
# (redirects, errors, naked URLs, ignored schemes, tricky syntax).
set -euo pipefail

# Destination path; defaults to test.md when no argument is given.
OUTPUT_FILE="${1:-test.md}"

# Quoted 'EOF' delimiter: the heredoc body is written verbatim, no expansion.
cat > "${OUTPUT_FILE}" <<'EOF'
# mdlink Test

- [ok](https://httpbin.org/status/200)
- [redirect](http://github.com)
- [broken](https://httpbin.org/status/404)
- Naked: https://httpbin.org/status/500
- 

## Additional Cases

- [redirect with title](http://github.com "GitHub redirect")
- [query and fragment](https://example.com/docs?lang=de#intro)
- [duplicate redirect](http://github.com)
- [duplicate redirect again](http://github.com)
- [non-http scheme should be ignored](mailto:team@example.org)
- [ftp should be ignored](ftp://speedtest.tele2.net)
- [inline code URL should not be a markdown link](`https://example.org/code`)
- [image in text should be ignored] text before  text after
- [parentheses in URL](https://en.wikipedia.org/wiki/Function_(mathematics))
- [trailing punctuation in sentence] See https://example.org/docs, for details.
- autolink angle brackets: <https://example.org/autolink>
- bare www should be ignored: www.example.org
EOF

echo "Generated ${OUTPUT_FILE}"
|
||||||
Reference in New Issue
Block a user