feat: add incremental file scan state with optional --rescan full run

This commit is contained in:
2026-04-17 19:51:07 +02:00
parent c8d4128b79
commit 1583871cf9
5 changed files with 153 additions and 59 deletions
+1
View File
@@ -32,6 +32,7 @@ htmlcov/
Thumbs.db
*.log
.codex
.mdlink-state.json
# IDE/editor settings
.idea/
+4
View File
@@ -38,9 +38,13 @@ mdlink .
- `--timeout FLOAT`
Per-request timeout in seconds (default: `10.0`).
- `--rescan`
Discard `.mdlink-state.json` and run a full scan on all matching files.
- `--check CODE`
Report only selected HTTP status codes. Repeat option for multiple codes (for example `--check 404` or `--check 301 --check 404`).
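By default, `mdlink` records the paths of scanned files in `.mdlink-state.json` (in the current working directory) and skips those files in later runs, even if their contents have changed. Use `--rescan` to check them again.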
## Interactive Redirect Rewrite
Step 1: When a Markdown link redirects, `mdlink` prompts:
+6 -3
View File
@@ -29,12 +29,14 @@ class LinkChecker:
def check(self, url: str) -> LinkCheckResult:
try:
response = self._client.get(url)
original_status = response.history[0].status_code if response.history else response.status_code
initial_status = response.history[0].status_code if response.history else response.status_code
final_status = response.status_code
final_url = str(response.url)
redirected = final_url != url
return LinkCheckResult(
original_url=url,
status_code=original_status,
initial_status_code=initial_status,
final_status_code=final_status,
final_url=final_url,
redirected=redirected,
error=None,
@@ -42,7 +44,8 @@ class LinkChecker:
except httpx.HTTPError as exc:
return LinkCheckResult(
original_url=url,
status_code=None,
initial_status_code=None,
final_status_code=None,
final_url=None,
redirected=False,
error=str(exc),
+134 -54
View File
@@ -1,6 +1,7 @@
from __future__ import annotations
import argparse
import json
from collections import defaultdict
from pathlib import Path
from typing import Optional
@@ -12,13 +13,24 @@ from .ast_editor import ASTMarkdownEditor
from .checker import LinkChecker
from .models import LinkCheckResult, LinkRecord
from .scanner import MarkdownScanner
from .utils import unique_preserve_order
from .utils import iter_markdown_files, unique_preserve_order
# Scan-state file. The path is relative, so it resolves against the current
# working directory of the process at the time it is read or written.
STATE_FILE = Path(".mdlink-state.json")
class RewriteAborted(Exception):
    """Signal that the user aborted the interactive rewrite prompts.

    Raised by ``_prompt_yes_no`` when a prompt is interrupted with a
    ``KeyboardInterrupt``. ``_handle_rewrites`` catches it, reports the abort,
    and returns before any pending replacement is written to disk.
    """
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(prog="mdlink", description="Scan Markdown files and validate links.")
parser.add_argument("path", type=Path, help="Directory or Markdown file to scan")
parser.add_argument("--timeout", type=float, default=10.0, help="Request timeout in seconds")
parser.add_argument(
"--rescan",
action="store_true",
help="Discard existing scan state and rescan all matching files.",
)
parser.add_argument(
"--check",
dest="check_codes",
@@ -30,6 +42,28 @@ def parse_args() -> argparse.Namespace:
return parser.parse_args()
def _normalize_state_key(path: Path) -> str:
    """Return the scan-state key for *path*.

    The key is the absolute form of the path with ``..`` segments and symlinks
    resolved, so one file always maps to one key no matter how the scan root
    was spelled on the command line (relative, absolute, ``docs/../x.md``).

    Args:
        path: Path of a scanned file; it does not have to exist.

    Returns:
        The normalized absolute path as a string.
    """
    # resolve() is non-strict by default, so a missing file does not raise.
    return str(path.resolve())
def _load_state(path: Path) -> set[str]:
    """Load the set of already-scanned file keys from the JSON state file.

    The state file is expected to hold ``{"files": [<str>, ...]}``. Loading is
    best-effort: a missing, unreadable, undecodable, or malformed file yields
    an empty set, which makes the caller fall back to a full scan.

    Args:
        path: Location of the state file.

    Returns:
        The string entries of the ``"files"`` list; non-string entries are
        dropped. Empty when no usable state is available.
    """
    try:
        payload = json.loads(path.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # OSError covers a missing or unreadable file. ValueError covers both
        # json.JSONDecodeError and the UnicodeDecodeError raised for bytes
        # that are not valid UTF-8.
        return set()

    # Valid JSON need not be an object (e.g. a bare list or number); only a
    # dict has the "files" entry we are looking for.
    if not isinstance(payload, dict):
        return set()

    files = payload.get("files")
    if not isinstance(files, list):
        return set()
    return {item for item in files if isinstance(item, str)}
def _save_state(path: Path, scanned_files: set[str]) -> None:
    """Write *scanned_files* to *path* as a JSON document.

    The file holds a single object, ``{"files": [...]}``, with the keys sorted
    so the output is stable from run to run, followed by a trailing newline.

    Args:
        path: Destination of the state file; overwritten if it exists.
        scanned_files: Keys of the files that count as scanned.
    """
    ordered_keys = sorted(scanned_files)
    document = json.dumps({"files": ordered_keys}, indent=2, ensure_ascii=True)
    with path.open("w", encoding="utf-8") as handle:
        handle.write(document + "\n")
def _is_listed_result(result: LinkCheckResult, check_codes: Optional[set[int]]) -> bool:
if check_codes:
return result.status_code is not None and result.status_code in check_codes
@@ -128,6 +162,14 @@ def _collect_https_candidates(
return candidates
def _prompt_yes_no(console: Console, prompt: str) -> bool:
    """Ask a yes/no question and report whether the user answered "y".

    Args:
        console: Object whose ``input(prompt)`` method displays the prompt and
            returns the typed line.
        prompt: Text shown to the user.

    Returns:
        True only when the answer, stripped and lower-cased, is exactly
        ``"y"``; any other answer (including an empty one) is False.

    Raises:
        RewriteAborted: If the prompt is interrupted (Ctrl-C) or input ends
            before an answer is read (EOF, e.g. Ctrl-D or a closed stdin).
    """
    try:
        answer = console.input(prompt)
    except (KeyboardInterrupt, EOFError) as exc:
        # End of input is treated like an interrupt: no answer can be
        # obtained, so abort the rewrite session instead of crashing.
        raise RewriteAborted from exc
    return answer.strip().lower() == "y"
def _handle_rewrites(
records: list[LinkRecord],
checks: dict[str, LinkCheckResult],
@@ -141,66 +183,72 @@ def _handle_rewrites(
seen_pairs: set[tuple[Path, str, str]] = set()
check_cache: dict[str, LinkCheckResult] = {}
if redirects:
console.print("\n[bold]Redirect replacements[/bold]")
try:
if redirects:
console.print("\n[bold]Redirect replacements[/bold]")
for record, result in redirects:
if record.kind != "markdown":
continue
final_url = result.final_url
if not final_url:
continue
pair = (record.file_path, record.url, final_url)
if pair in seen_pairs:
continue
seen_pairs.add(pair)
for record, result in redirects:
if record.kind != "markdown":
continue
final_url = result.final_url
if not final_url:
continue
pair = (record.file_path, record.url, final_url)
if pair in seen_pairs:
continue
seen_pairs.add(pair)
console.print(f"\n[cyan]{record.file_path}:{record.line}[/cyan]")
console.print(f"[yellow]{record.url}[/yellow] -> [green]{final_url}[/green]")
answer = console.input("Replace old URL with final URL? [y/N] ").strip().lower()
if answer != "y":
continue
console.print(f"\n[cyan]{record.file_path}:{record.line}[/cyan]")
console.print(f"[yellow]{record.url}[/yellow] -> [green]{final_url}[/green]")
if not _prompt_yes_no(console, "Replace old URL? [y/N] "):
continue
verification = _cached_check(checker=checker, cache=check_cache, url=final_url)
if verification.status_code != 200:
console.print(
f"[red]Skip:[/red] final URL no longer valid ({verification.status_code or verification.error})"
)
continue
replacements_by_file[record.file_path][record.url] = final_url
verification = _cached_check(checker=checker, cache=check_cache, url=final_url)
if verification.final_status_code != 200:
console.print(
"[red]Skip:[/red] New URL returned "
f"({verification.final_status_code or verification.error})."
)
continue
console.print("[green]Done:[/green] New URL returned (200).")
replacements_by_file[record.file_path][record.url] = final_url
https_candidates = _collect_https_candidates(records=records, checks=checks, check_codes=check_codes)
if https_candidates:
console.print("\n[bold]HTTPS upgrade candidates[/bold]")
https_candidates = _collect_https_candidates(records=records, checks=checks, check_codes=check_codes)
if https_candidates:
console.print("\n[bold]HTTPS upgrade candidates[/bold]")
for record, https_url in https_candidates:
if replacements_by_file[record.file_path].get(record.url):
continue
https_check = _cached_check(checker=checker, cache=check_cache, url=https_url)
if https_check.status_code != 200:
continue
final_url: Optional[str] = https_check.final_url or https_url
if final_url == record.url:
continue
for record, https_url in https_candidates:
if replacements_by_file[record.file_path].get(record.url):
continue
https_check = _cached_check(checker=checker, cache=check_cache, url=https_url)
if https_check.final_status_code != 200:
continue
final_url: Optional[str] = https_check.final_url or https_url
if final_url == record.url:
continue
pair = (record.file_path, record.url, final_url)
if pair in seen_pairs:
continue
seen_pairs.add(pair)
pair = (record.file_path, record.url, final_url)
if pair in seen_pairs:
continue
seen_pairs.add(pair)
console.print(f"\n[cyan]{record.file_path}:{record.line}[/cyan]")
console.print(f"[yellow]{record.url}[/yellow] -> [green]{final_url}[/green]")
answer = console.input("Replace HTTP URL with HTTPS variant? [y/N] ").strip().lower()
if answer != "y":
continue
console.print(f"\n[cyan]{record.file_path}:{record.line}[/cyan]")
console.print(f"[yellow]{record.url}[/yellow] -> [green]{final_url}[/green]")
if not _prompt_yes_no(console, "Replace old URL? [y/N] "):
continue
verification = _cached_check(checker=checker, cache=check_cache, url=final_url)
if verification.status_code != 200:
console.print(
f"[red]Skip:[/red] HTTPS URL no longer valid ({verification.status_code or verification.error})"
)
continue
replacements_by_file[record.file_path][record.url] = final_url
verification = _cached_check(checker=checker, cache=check_cache, url=final_url)
if verification.final_status_code != 200:
console.print(
"[red]Skip:[/red] New URL returned "
f"({verification.final_status_code or verification.error})."
)
continue
console.print("[green]Done:[/green] New URL returned (200).")
replacements_by_file[record.file_path][record.url] = final_url
except RewriteAborted:
console.print("\n[yellow]Aborted by user during rewrite prompts. Discarded pending changes.[/yellow]")
return
for file_path, replacements in replacements_by_file.items():
content = file_path.read_text(encoding="utf-8")
@@ -216,7 +264,39 @@ def main() -> None:
check_codes = set(args.check_codes) if args.check_codes else None
scanner = MarkdownScanner()
records = scanner.scan_path(args.path)
all_files = list(iter_markdown_files(args.path))
if not all_files:
console.print("No Markdown files found.")
return
old_state = set() if args.rescan else _load_state(STATE_FILE)
files_to_scan: list[Path] = []
skipped_count = 0
for file_path in all_files:
state_key = _normalize_state_key(file_path)
if state_key in old_state:
skipped_count += 1
continue
files_to_scan.append(file_path)
console.print(
f"Files total: {len(all_files)} | to scan: {len(files_to_scan)} | skipped: {skipped_count}"
)
if not files_to_scan:
console.print("No new files to scan. Use --rescan to force a full scan.")
return
records: list[LinkRecord] = []
for file_path in files_to_scan:
content = file_path.read_text(encoding="utf-8")
records.extend(scanner.scan_content(file_path=file_path, content=content))
new_state = set(old_state)
for file_path in files_to_scan:
new_state.add(_normalize_state_key(file_path))
_save_state(STATE_FILE, new_state)
if not records:
console.print("No links found.")
return
+8 -2
View File
@@ -16,13 +16,19 @@ class LinkRecord:
@dataclass(frozen=True)
class LinkCheckResult:
original_url: str
status_code: Optional[int]
initial_status_code: Optional[int]
final_status_code: Optional[int]
final_url: Optional[str]
redirected: bool
error: Optional[str] = None
@property
def status_code(self) -> Optional[int]:
# Backward-compatible alias used by reporting and --check filtering.
return self.initial_status_code
@property
def should_report(self) -> bool:
if self.error is not None:
return True
return self.status_code != 200
return self.initial_status_code != 200