feat: add incremental file scan state with optional --rescan full run

2026-04-17 19:51:07 +02:00
parent c8d4128b79
commit 1583871cf9
5 changed files with 153 additions and 59 deletions
@@ -32,6 +32,7 @@ htmlcov/
 Thumbs.db
 *.log
 .codex
 .mdlink-state.json
 # IDE/editor settings
 .idea/
@@ -38,9 +38,13 @@ mdlink .
 - `--timeout FLOAT`  
  Per-request timeout in seconds (default: `10.0`).
 - `--rescan`  
  Ignore the existing `.mdlink-state.json`, run a full scan on all matching files, and rewrite the state file afterwards.
 - `--check CODE`  
  Report only selected HTTP status codes. Repeat option for multiple codes (for example `--check 404` or `--check 301 --check 404`).
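 By default, `mdlink` stores scanned file paths in `.mdlink-state.json` and skips those files in later runs. Files are tracked by path only, so a file that was modified after its first scan is still skipped; use `--rescan` to check it again.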
 ## Interactive Redirect Rewrite
 Step 1: When a Markdown link redirects, `mdlink` prompts:
@@ -29,12 +29,14 @@ class LinkChecker:
    def check(self, url: str) -> LinkCheckResult:
        try:
            response = self._client.get(url)
-            original_status = response.history[0].status_code if response.history else response.status_code
+            initial_status = response.history[0].status_code if response.history else response.status_code
            final_status = response.status_code
            final_url = str(response.url)
            redirected = final_url != url
            return LinkCheckResult(
                original_url=url,
-                status_code=original_status,
+                initial_status_code=initial_status,
                final_status_code=final_status,
                final_url=final_url,
                redirected=redirected,
                error=None,
@@ -42,7 +44,8 @@ class LinkChecker:
        except httpx.HTTPError as exc:
            return LinkCheckResult(
                original_url=url,
-                status_code=None,
+                initial_status_code=None,
                final_status_code=None,
                final_url=None,
                redirected=False,
                error=str(exc),
@@ -1,6 +1,7 @@
 from __future__ import annotations
 import argparse
 import json
 from collections import defaultdict
 from pathlib import Path
 from typing import Optional
@@ -12,13 +13,24 @@ from .ast_editor import ASTMarkdownEditor
 from .checker import LinkChecker
 from .models import LinkCheckResult, LinkRecord
 from .scanner import MarkdownScanner
-from .utils import unique_preserve_order
+from .utils import iter_markdown_files, unique_preserve_order
 # Location of the incremental scan state. The path is relative, so it is
 # looked up in (and written to) the current working directory.
 STATE_FILE = Path(".mdlink-state.json")
 class RewriteAborted(Exception):
    """Signal that the user cancelled an interactive rewrite prompt."""
 def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(prog="mdlink", description="Scan Markdown files and validate links.")
    parser.add_argument("path", type=Path, help="Directory or Markdown file to scan")
    parser.add_argument("--timeout", type=float, default=10.0, help="Request timeout in seconds")
    parser.add_argument(
        "--rescan",
        action="store_true",
        help="Discard existing scan state and rescan all matching files.",
    )
    parser.add_argument(
        "--check",
        dest="check_codes",
@@ -30,6 +42,28 @@ def parse_args() -> argparse.Namespace:
    return parser.parse_args()
 def _normalize_state_key(path: Path) -> str:
    return str(path)
 def _load_state(path: Path) -> set[str]:
    if not path.exists():
        return set()
    try:
        payload = json.loads(path.read_text(encoding="utf-8"))
    except (OSError, json.JSONDecodeError):
        return set()
    files = payload.get("files")
    if not isinstance(files, list):
        return set()
    return {item for item in files if isinstance(item, str)}
 def _save_state(path: Path, scanned_files: set[str]) -> None:
    payload = {"files": sorted(scanned_files)}
    path.write_text(json.dumps(payload, indent=2, ensure_ascii=True) + "\n", encoding="utf-8")
 def _is_listed_result(result: LinkCheckResult, check_codes: Optional[set[int]]) -> bool:
    if check_codes:
        return result.status_code is not None and result.status_code in check_codes
@@ -128,6 +162,14 @@ def _collect_https_candidates(
    return candidates
 def _prompt_yes_no(console: Console, prompt: str) -> bool:
    try:
        answer = console.input(prompt).strip().lower()
    except KeyboardInterrupt as exc:
        raise RewriteAborted from exc
    return answer == "y"
 def _handle_rewrites(
    records: list[LinkRecord],
    checks: dict[str, LinkCheckResult],
@@ -141,6 +183,7 @@ def _handle_rewrites(
    seen_pairs: set[tuple[Path, str, str]] = set()
    check_cache: dict[str, LinkCheckResult] = {}
    try:
        if redirects:
            console.print("\n[bold]Redirect replacements[/bold]")
@@ -157,16 +200,17 @@ def _handle_rewrites(
            console.print(f"\n[cyan]{record.file_path}:{record.line}[/cyan]")
            console.print(f"[yellow]{record.url}[/yellow] -> [green]{final_url}[/green]")
-        answer = console.input("Replace old URL with final URL? [y/N] ").strip().lower()
+            if not _prompt_yes_no(console, "Replace old URL? [y/N] "):
        if answer != "y":
                continue
            verification = _cached_check(checker=checker, cache=check_cache, url=final_url)
-        if verification.status_code != 200:
+            if verification.final_status_code != 200:
                console.print(
-                f"[red]Skip:[/red] final URL no longer valid ({verification.status_code or verification.error})"
+                    "[red]Skip:[/red] New URL returned "
                    f"({verification.final_status_code or verification.error})."
                )
                continue
            console.print("[green]Done:[/green] New URL returned (200).")
            replacements_by_file[record.file_path][record.url] = final_url
        https_candidates = _collect_https_candidates(records=records, checks=checks, check_codes=check_codes)
@@ -177,7 +221,7 @@ def _handle_rewrites(
            if replacements_by_file[record.file_path].get(record.url):
                continue
            https_check = _cached_check(checker=checker, cache=check_cache, url=https_url)
-        if https_check.status_code != 200:
+            if https_check.final_status_code != 200:
                continue
            final_url: Optional[str] = https_check.final_url or https_url
            if final_url == record.url:
@@ -190,17 +234,21 @@ def _handle_rewrites(
            console.print(f"\n[cyan]{record.file_path}:{record.line}[/cyan]")
            console.print(f"[yellow]{record.url}[/yellow] -> [green]{final_url}[/green]")
-        answer = console.input("Replace HTTP URL with HTTPS variant? [y/N] ").strip().lower()
+            if not _prompt_yes_no(console, "Replace old URL? [y/N] "):
        if answer != "y":
                continue
            verification = _cached_check(checker=checker, cache=check_cache, url=final_url)
-        if verification.status_code != 200:
+            if verification.final_status_code != 200:
                console.print(
-                f"[red]Skip:[/red] HTTPS URL no longer valid ({verification.status_code or verification.error})"
+                    "[red]Skip:[/red] New URL returned "
                    f"({verification.final_status_code or verification.error})."
                )
                continue
            console.print("[green]Done:[/green] New URL returned (200).")
            replacements_by_file[record.file_path][record.url] = final_url
    except RewriteAborted:
        console.print("\n[yellow]Aborted by user during rewrite prompts. Discarded pending changes.[/yellow]")
        return
    for file_path, replacements in replacements_by_file.items():
        content = file_path.read_text(encoding="utf-8")
@@ -216,7 +264,39 @@ def main() -> None:
    check_codes = set(args.check_codes) if args.check_codes else None
    scanner = MarkdownScanner()
-    records = scanner.scan_path(args.path)
+    all_files = list(iter_markdown_files(args.path))
    if not all_files:
        console.print("No Markdown files found.")
        return
    old_state = set() if args.rescan else _load_state(STATE_FILE)
    files_to_scan: list[Path] = []
    skipped_count = 0
    for file_path in all_files:
        state_key = _normalize_state_key(file_path)
        if state_key in old_state:
            skipped_count += 1
            continue
        files_to_scan.append(file_path)
    console.print(
        f"Files total: {len(all_files)} | to scan: {len(files_to_scan)} | skipped: {skipped_count}"
    )
    if not files_to_scan:
        console.print("No new files to scan. Use --rescan to force a full scan.")
        return
    records: list[LinkRecord] = []
    for file_path in files_to_scan:
        content = file_path.read_text(encoding="utf-8")
        records.extend(scanner.scan_content(file_path=file_path, content=content))
    new_state = set(old_state)
    for file_path in files_to_scan:
        new_state.add(_normalize_state_key(file_path))
    _save_state(STATE_FILE, new_state)
    if not records:
        console.print("No links found.")
        return
@@ -16,13 +16,19 @@ class LinkRecord:
@dataclass(frozen=True)
 class LinkCheckResult:
    original_url: str
-    status_code: Optional[int]
+    initial_status_code: Optional[int]
    final_status_code: Optional[int]
    final_url: Optional[str]
    redirected: bool
    error: Optional[str] = None
    @property
    def status_code(self) -> Optional[int]:
        # Backward-compatible alias used by reporting and --check filtering.
        return self.initial_status_code
    @property
    def should_report(self) -> bool:
        if self.error is not None:
            return True
-        return self.status_code != 200
+        return self.initial_status_code != 200