feat: add incremental file scan state with optional --rescan full run

This commit is contained in:
2026-04-17 19:51:07 +02:00
parent c8d4128b79
commit 1583871cf9
5 changed files with 153 additions and 59 deletions
+1
View File
@@ -32,6 +32,7 @@ htmlcov/
Thumbs.db Thumbs.db
*.log *.log
.codex .codex
.mdlink-state.json
# IDE/editor settings # IDE/editor settings
.idea/ .idea/
+4
View File
@@ -38,9 +38,13 @@ mdlink .
- `--timeout FLOAT` - `--timeout FLOAT`
Per-request timeout in seconds (default: `10.0`). Per-request timeout in seconds (default: `10.0`).
- `--rescan`
Discard `.mdlink-state.json` and run a full scan on all matching files.
- `--check CODE` - `--check CODE`
Report only selected HTTP status codes. Repeat option for multiple codes (for example `--check 404` or `--check 301 --check 404`). Report only selected HTTP status codes. Repeat option for multiple codes (for example `--check 404` or `--check 301 --check 404`).
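By default, `mdlink` records the path of every file it scans in `.mdlink-state.json` (created in the current working directory) and skips those files in later runs, even if their content has changed since. Use `--rescan` to check them again.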
## Interactive Redirect Rewrite ## Interactive Redirect Rewrite
Step 1: When a Markdown link redirects, `mdlink` prompts: Step 1: When a Markdown link redirects, `mdlink` prompts:
+6 -3
View File
@@ -29,12 +29,14 @@ class LinkChecker:
def check(self, url: str) -> LinkCheckResult: def check(self, url: str) -> LinkCheckResult:
try: try:
response = self._client.get(url) response = self._client.get(url)
original_status = response.history[0].status_code if response.history else response.status_code initial_status = response.history[0].status_code if response.history else response.status_code
final_status = response.status_code
final_url = str(response.url) final_url = str(response.url)
redirected = final_url != url redirected = final_url != url
return LinkCheckResult( return LinkCheckResult(
original_url=url, original_url=url,
status_code=original_status, initial_status_code=initial_status,
final_status_code=final_status,
final_url=final_url, final_url=final_url,
redirected=redirected, redirected=redirected,
error=None, error=None,
@@ -42,7 +44,8 @@ class LinkChecker:
except httpx.HTTPError as exc: except httpx.HTTPError as exc:
return LinkCheckResult( return LinkCheckResult(
original_url=url, original_url=url,
status_code=None, initial_status_code=None,
final_status_code=None,
final_url=None, final_url=None,
redirected=False, redirected=False,
error=str(exc), error=str(exc),
+91 -11
View File
@@ -1,6 +1,7 @@
from __future__ import annotations from __future__ import annotations
import argparse import argparse
import json
from collections import defaultdict from collections import defaultdict
from pathlib import Path from pathlib import Path
from typing import Optional from typing import Optional
@@ -12,13 +13,24 @@ from .ast_editor import ASTMarkdownEditor
from .checker import LinkChecker from .checker import LinkChecker
from .models import LinkCheckResult, LinkRecord from .models import LinkCheckResult, LinkRecord
from .scanner import MarkdownScanner from .scanner import MarkdownScanner
from .utils import unique_preserve_order from .utils import iter_markdown_files, unique_preserve_order
# Location of the scan-state file. The path is relative, so it is resolved
# against the current working directory of the process at read/write time.
STATE_FILE = Path(".mdlink-state.json")
class RewriteAborted(Exception):
    """Signal that the user cancelled the interactive rewrite prompts."""
def parse_args() -> argparse.Namespace: def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(prog="mdlink", description="Scan Markdown files and validate links.") parser = argparse.ArgumentParser(prog="mdlink", description="Scan Markdown files and validate links.")
parser.add_argument("path", type=Path, help="Directory or Markdown file to scan") parser.add_argument("path", type=Path, help="Directory or Markdown file to scan")
parser.add_argument("--timeout", type=float, default=10.0, help="Request timeout in seconds") parser.add_argument("--timeout", type=float, default=10.0, help="Request timeout in seconds")
parser.add_argument(
"--rescan",
action="store_true",
help="Discard existing scan state and rescan all matching files.",
)
parser.add_argument( parser.add_argument(
"--check", "--check",
dest="check_codes", dest="check_codes",
@@ -30,6 +42,28 @@ def parse_args() -> argparse.Namespace:
return parser.parse_args() return parser.parse_args()
def _normalize_state_key(path: Path) -> str:
return str(path)
def _load_state(path: Path) -> set[str]:
if not path.exists():
return set()
try:
payload = json.loads(path.read_text(encoding="utf-8"))
except (OSError, json.JSONDecodeError):
return set()
files = payload.get("files")
if not isinstance(files, list):
return set()
return {item for item in files if isinstance(item, str)}
def _save_state(path: Path, scanned_files: set[str]) -> None:
payload = {"files": sorted(scanned_files)}
path.write_text(json.dumps(payload, indent=2, ensure_ascii=True) + "\n", encoding="utf-8")
def _is_listed_result(result: LinkCheckResult, check_codes: Optional[set[int]]) -> bool: def _is_listed_result(result: LinkCheckResult, check_codes: Optional[set[int]]) -> bool:
if check_codes: if check_codes:
return result.status_code is not None and result.status_code in check_codes return result.status_code is not None and result.status_code in check_codes
@@ -128,6 +162,14 @@ def _collect_https_candidates(
return candidates return candidates
def _prompt_yes_no(console: Console, prompt: str) -> bool:
try:
answer = console.input(prompt).strip().lower()
except KeyboardInterrupt as exc:
raise RewriteAborted from exc
return answer == "y"
def _handle_rewrites( def _handle_rewrites(
records: list[LinkRecord], records: list[LinkRecord],
checks: dict[str, LinkCheckResult], checks: dict[str, LinkCheckResult],
@@ -141,6 +183,7 @@ def _handle_rewrites(
seen_pairs: set[tuple[Path, str, str]] = set() seen_pairs: set[tuple[Path, str, str]] = set()
check_cache: dict[str, LinkCheckResult] = {} check_cache: dict[str, LinkCheckResult] = {}
try:
if redirects: if redirects:
console.print("\n[bold]Redirect replacements[/bold]") console.print("\n[bold]Redirect replacements[/bold]")
@@ -157,16 +200,17 @@ def _handle_rewrites(
console.print(f"\n[cyan]{record.file_path}:{record.line}[/cyan]") console.print(f"\n[cyan]{record.file_path}:{record.line}[/cyan]")
console.print(f"[yellow]{record.url}[/yellow] -> [green]{final_url}[/green]") console.print(f"[yellow]{record.url}[/yellow] -> [green]{final_url}[/green]")
answer = console.input("Replace old URL with final URL? [y/N] ").strip().lower() if not _prompt_yes_no(console, "Replace old URL? [y/N] "):
if answer != "y":
continue continue
verification = _cached_check(checker=checker, cache=check_cache, url=final_url) verification = _cached_check(checker=checker, cache=check_cache, url=final_url)
if verification.status_code != 200: if verification.final_status_code != 200:
console.print( console.print(
f"[red]Skip:[/red] final URL no longer valid ({verification.status_code or verification.error})" "[red]Skip:[/red] New URL returned "
f"({verification.final_status_code or verification.error})."
) )
continue continue
console.print("[green]Done:[/green] New URL returned (200).")
replacements_by_file[record.file_path][record.url] = final_url replacements_by_file[record.file_path][record.url] = final_url
https_candidates = _collect_https_candidates(records=records, checks=checks, check_codes=check_codes) https_candidates = _collect_https_candidates(records=records, checks=checks, check_codes=check_codes)
@@ -177,7 +221,7 @@ def _handle_rewrites(
if replacements_by_file[record.file_path].get(record.url): if replacements_by_file[record.file_path].get(record.url):
continue continue
https_check = _cached_check(checker=checker, cache=check_cache, url=https_url) https_check = _cached_check(checker=checker, cache=check_cache, url=https_url)
if https_check.status_code != 200: if https_check.final_status_code != 200:
continue continue
final_url: Optional[str] = https_check.final_url or https_url final_url: Optional[str] = https_check.final_url or https_url
if final_url == record.url: if final_url == record.url:
@@ -190,17 +234,21 @@ def _handle_rewrites(
console.print(f"\n[cyan]{record.file_path}:{record.line}[/cyan]") console.print(f"\n[cyan]{record.file_path}:{record.line}[/cyan]")
console.print(f"[yellow]{record.url}[/yellow] -> [green]{final_url}[/green]") console.print(f"[yellow]{record.url}[/yellow] -> [green]{final_url}[/green]")
answer = console.input("Replace HTTP URL with HTTPS variant? [y/N] ").strip().lower() if not _prompt_yes_no(console, "Replace old URL? [y/N] "):
if answer != "y":
continue continue
verification = _cached_check(checker=checker, cache=check_cache, url=final_url) verification = _cached_check(checker=checker, cache=check_cache, url=final_url)
if verification.status_code != 200: if verification.final_status_code != 200:
console.print( console.print(
f"[red]Skip:[/red] HTTPS URL no longer valid ({verification.status_code or verification.error})" "[red]Skip:[/red] New URL returned "
f"({verification.final_status_code or verification.error})."
) )
continue continue
console.print("[green]Done:[/green] New URL returned (200).")
replacements_by_file[record.file_path][record.url] = final_url replacements_by_file[record.file_path][record.url] = final_url
except RewriteAborted:
console.print("\n[yellow]Aborted by user during rewrite prompts. Discarded pending changes.[/yellow]")
return
for file_path, replacements in replacements_by_file.items(): for file_path, replacements in replacements_by_file.items():
content = file_path.read_text(encoding="utf-8") content = file_path.read_text(encoding="utf-8")
@@ -216,7 +264,39 @@ def main() -> None:
check_codes = set(args.check_codes) if args.check_codes else None check_codes = set(args.check_codes) if args.check_codes else None
scanner = MarkdownScanner() scanner = MarkdownScanner()
records = scanner.scan_path(args.path) all_files = list(iter_markdown_files(args.path))
if not all_files:
console.print("No Markdown files found.")
return
old_state = set() if args.rescan else _load_state(STATE_FILE)
files_to_scan: list[Path] = []
skipped_count = 0
for file_path in all_files:
state_key = _normalize_state_key(file_path)
if state_key in old_state:
skipped_count += 1
continue
files_to_scan.append(file_path)
console.print(
f"Files total: {len(all_files)} | to scan: {len(files_to_scan)} | skipped: {skipped_count}"
)
if not files_to_scan:
console.print("No new files to scan. Use --rescan to force a full scan.")
return
records: list[LinkRecord] = []
for file_path in files_to_scan:
content = file_path.read_text(encoding="utf-8")
records.extend(scanner.scan_content(file_path=file_path, content=content))
new_state = set(old_state)
for file_path in files_to_scan:
new_state.add(_normalize_state_key(file_path))
_save_state(STATE_FILE, new_state)
if not records: if not records:
console.print("No links found.") console.print("No links found.")
return return
+8 -2
View File
@@ -16,13 +16,19 @@ class LinkRecord:
@dataclass(frozen=True) @dataclass(frozen=True)
class LinkCheckResult: class LinkCheckResult:
original_url: str original_url: str
status_code: Optional[int] initial_status_code: Optional[int]
final_status_code: Optional[int]
final_url: Optional[str] final_url: Optional[str]
redirected: bool redirected: bool
error: Optional[str] = None error: Optional[str] = None
@property
def status_code(self) -> Optional[int]:
# Backward-compatible alias used by reporting and --check filtering.
return self.initial_status_code
@property @property
def should_report(self) -> bool: def should_report(self) -> bool:
if self.error is not None: if self.error is not None:
return True return True
return self.status_code != 200 return self.initial_status_code != 200