feat: add incremental file scan state with optional --rescan full run
This commit is contained in:
@@ -32,6 +32,7 @@ htmlcov/
|
||||
Thumbs.db
|
||||
*.log
|
||||
.codex
|
||||
.mdlink-state.json
|
||||
|
||||
# IDE/editor settings
|
||||
.idea/
|
||||
|
||||
@@ -38,9 +38,13 @@ mdlink .
|
||||
|
||||
- `--timeout FLOAT`
|
||||
Per-request timeout in seconds (default: `10.0`).
|
||||
- `--rescan`
|
||||
Discard `.mdlink-state.json` and run a full scan on all matching files.
|
||||
- `--check CODE`
|
||||
Report only selected HTTP status codes. Repeat the option for multiple codes (for example `--check 404` or `--check 301 --check 404`).
|
||||
|
||||
By default, `mdlink` stores scanned file paths in `.mdlink-state.json` and skips those files in later runs.
|
||||
|
||||
## Interactive Redirect Rewrite
|
||||
|
||||
Step 1: When a Markdown link redirects, `mdlink` prompts:
|
||||
|
||||
+6
-3
@@ -29,12 +29,14 @@ class LinkChecker:
|
||||
def check(self, url: str) -> LinkCheckResult:
|
||||
try:
|
||||
response = self._client.get(url)
|
||||
original_status = response.history[0].status_code if response.history else response.status_code
|
||||
initial_status = response.history[0].status_code if response.history else response.status_code
|
||||
final_status = response.status_code
|
||||
final_url = str(response.url)
|
||||
redirected = final_url != url
|
||||
return LinkCheckResult(
|
||||
original_url=url,
|
||||
status_code=original_status,
|
||||
initial_status_code=initial_status,
|
||||
final_status_code=final_status,
|
||||
final_url=final_url,
|
||||
redirected=redirected,
|
||||
error=None,
|
||||
@@ -42,7 +44,8 @@ class LinkChecker:
|
||||
except httpx.HTTPError as exc:
|
||||
return LinkCheckResult(
|
||||
original_url=url,
|
||||
status_code=None,
|
||||
initial_status_code=None,
|
||||
final_status_code=None,
|
||||
final_url=None,
|
||||
redirected=False,
|
||||
error=str(exc),
|
||||
|
||||
+134
-54
@@ -1,6 +1,7 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
@@ -12,13 +13,24 @@ from .ast_editor import ASTMarkdownEditor
|
||||
from .checker import LinkChecker
|
||||
from .models import LinkCheckResult, LinkRecord
|
||||
from .scanner import MarkdownScanner
|
||||
from .utils import unique_preserve_order
|
||||
from .utils import iter_markdown_files, unique_preserve_order
|
||||
|
||||
STATE_FILE = Path(".mdlink-state.json")
|
||||
|
||||
|
||||
class RewriteAborted(Exception):
    """Signal that the user cancelled the interactive rewrite prompts."""
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
|
||||
parser = argparse.ArgumentParser(prog="mdlink", description="Scan Markdown files and validate links.")
|
||||
parser.add_argument("path", type=Path, help="Directory or Markdown file to scan")
|
||||
parser.add_argument("--timeout", type=float, default=10.0, help="Request timeout in seconds")
|
||||
parser.add_argument(
|
||||
"--rescan",
|
||||
action="store_true",
|
||||
help="Discard existing scan state and rescan all matching files.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--check",
|
||||
dest="check_codes",
|
||||
@@ -30,6 +42,28 @@ def parse_args() -> argparse.Namespace:
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
def _normalize_state_key(path: Path) -> str:
    """Return the string form of *path* used as its key in the scan state.

    NOTE(review): the path is only stringified; nothing is resolved or
    case-folded here, so the same file reached through differently spelled
    paths (relative vs. absolute, for example) yields different keys.
    """
    state_key = str(path)
    return state_key
|
||||
|
||||
|
||||
def _load_state(path: Path) -> set[str]:
    """Load the set of already-scanned file keys from the state file at *path*.

    The file is expected to hold a JSON object whose ``"files"`` entry is a
    list of strings. A missing, unreadable, undecodable, or malformed state
    file is treated as "nothing scanned yet" rather than as an error.

    Args:
        path: Location of the JSON state file.

    Returns:
        The string entries of ``"files"``; an empty set when no usable
        state is available. Non-string list entries are ignored.
    """
    try:
        payload = json.loads(path.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # OSError covers a missing or unreadable file. ValueError covers both
        # json.JSONDecodeError and the UnicodeDecodeError raised when the
        # file is not valid UTF-8.
        return set()

    if not isinstance(payload, dict):
        # Valid JSON with a non-object root (e.g. a bare list) has no "files".
        return set()

    files = payload.get("files")
    if not isinstance(files, list):
        return set()
    return {item for item in files if isinstance(item, str)}
|
||||
|
||||
|
||||
def _save_state(path: Path, scanned_files: set[str]) -> None:
    """Persist *scanned_files* to the state file at *path* as JSON.

    The payload is a JSON object with a single ``"files"`` key holding the
    sorted keys, indented by two spaces and terminated by a newline.

    The data is first written to a sibling ``<name>.tmp`` file and then moved
    over *path*, so an interrupted write cannot leave a truncated state file
    behind (a truncated file would be read back as "nothing scanned yet").

    Args:
        path: Destination of the state file.
        scanned_files: State keys of all files considered scanned.

    Raises:
        OSError: If the state cannot be written or moved into place.
    """
    payload = {"files": sorted(scanned_files)}
    serialized = json.dumps(payload, indent=2, ensure_ascii=True) + "\n"

    tmp_path = path.with_name(path.name + ".tmp")
    try:
        tmp_path.write_text(serialized, encoding="utf-8")
        tmp_path.replace(path)
    except OSError:
        # Best-effort cleanup of the partial temp file; the original error is
        # the one the caller needs to see.
        try:
            tmp_path.unlink()
        except OSError:
            pass
        raise
|
||||
|
||||
|
||||
def _is_listed_result(result: LinkCheckResult, check_codes: Optional[set[int]]) -> bool:
|
||||
if check_codes:
|
||||
return result.status_code is not None and result.status_code in check_codes
|
||||
@@ -128,6 +162,14 @@ def _collect_https_candidates(
|
||||
return candidates
|
||||
|
||||
|
||||
def _prompt_yes_no(console: Console, prompt: str) -> bool:
    """Ask a yes/no question on *console* and report whether the user agreed.

    Args:
        console: Object whose ``input(prompt)`` method returns the typed line.
        prompt: Text shown to the user.

    Returns:
        ``True`` when the trimmed, lower-cased answer is ``"y"`` or ``"yes"``;
        ``False`` for anything else, including an empty answer.

    Raises:
        RewriteAborted: If the prompt is interrupted (Ctrl-C) or input ends
            (EOF); chained from the original exception.
    """
    try:
        answer = console.input(prompt).strip().lower()
    except (KeyboardInterrupt, EOFError) as exc:
        # An exhausted or closed stdin cannot answer later prompts either, so
        # treat it like an explicit abort instead of crashing with a traceback.
        raise RewriteAborted from exc
    return answer in ("y", "yes")
|
||||
|
||||
|
||||
def _handle_rewrites(
|
||||
records: list[LinkRecord],
|
||||
checks: dict[str, LinkCheckResult],
|
||||
@@ -141,66 +183,72 @@ def _handle_rewrites(
|
||||
seen_pairs: set[tuple[Path, str, str]] = set()
|
||||
check_cache: dict[str, LinkCheckResult] = {}
|
||||
|
||||
if redirects:
|
||||
console.print("\n[bold]Redirect replacements[/bold]")
|
||||
try:
|
||||
if redirects:
|
||||
console.print("\n[bold]Redirect replacements[/bold]")
|
||||
|
||||
for record, result in redirects:
|
||||
if record.kind != "markdown":
|
||||
continue
|
||||
final_url = result.final_url
|
||||
if not final_url:
|
||||
continue
|
||||
pair = (record.file_path, record.url, final_url)
|
||||
if pair in seen_pairs:
|
||||
continue
|
||||
seen_pairs.add(pair)
|
||||
for record, result in redirects:
|
||||
if record.kind != "markdown":
|
||||
continue
|
||||
final_url = result.final_url
|
||||
if not final_url:
|
||||
continue
|
||||
pair = (record.file_path, record.url, final_url)
|
||||
if pair in seen_pairs:
|
||||
continue
|
||||
seen_pairs.add(pair)
|
||||
|
||||
console.print(f"\n[cyan]{record.file_path}:{record.line}[/cyan]")
|
||||
console.print(f"[yellow]{record.url}[/yellow] -> [green]{final_url}[/green]")
|
||||
answer = console.input("Replace old URL with final URL? [y/N] ").strip().lower()
|
||||
if answer != "y":
|
||||
continue
|
||||
console.print(f"\n[cyan]{record.file_path}:{record.line}[/cyan]")
|
||||
console.print(f"[yellow]{record.url}[/yellow] -> [green]{final_url}[/green]")
|
||||
if not _prompt_yes_no(console, "Replace old URL? [y/N] "):
|
||||
continue
|
||||
|
||||
verification = _cached_check(checker=checker, cache=check_cache, url=final_url)
|
||||
if verification.status_code != 200:
|
||||
console.print(
|
||||
f"[red]Skip:[/red] final URL no longer valid ({verification.status_code or verification.error})"
|
||||
)
|
||||
continue
|
||||
replacements_by_file[record.file_path][record.url] = final_url
|
||||
verification = _cached_check(checker=checker, cache=check_cache, url=final_url)
|
||||
if verification.final_status_code != 200:
|
||||
console.print(
|
||||
"[red]Skip:[/red] New URL returned "
|
||||
f"({verification.final_status_code or verification.error})."
|
||||
)
|
||||
continue
|
||||
console.print("[green]Done:[/green] New URL returned (200).")
|
||||
replacements_by_file[record.file_path][record.url] = final_url
|
||||
|
||||
https_candidates = _collect_https_candidates(records=records, checks=checks, check_codes=check_codes)
|
||||
if https_candidates:
|
||||
console.print("\n[bold]HTTPS upgrade candidates[/bold]")
|
||||
https_candidates = _collect_https_candidates(records=records, checks=checks, check_codes=check_codes)
|
||||
if https_candidates:
|
||||
console.print("\n[bold]HTTPS upgrade candidates[/bold]")
|
||||
|
||||
for record, https_url in https_candidates:
|
||||
if replacements_by_file[record.file_path].get(record.url):
|
||||
continue
|
||||
https_check = _cached_check(checker=checker, cache=check_cache, url=https_url)
|
||||
if https_check.status_code != 200:
|
||||
continue
|
||||
final_url: Optional[str] = https_check.final_url or https_url
|
||||
if final_url == record.url:
|
||||
continue
|
||||
for record, https_url in https_candidates:
|
||||
if replacements_by_file[record.file_path].get(record.url):
|
||||
continue
|
||||
https_check = _cached_check(checker=checker, cache=check_cache, url=https_url)
|
||||
if https_check.final_status_code != 200:
|
||||
continue
|
||||
final_url: Optional[str] = https_check.final_url or https_url
|
||||
if final_url == record.url:
|
||||
continue
|
||||
|
||||
pair = (record.file_path, record.url, final_url)
|
||||
if pair in seen_pairs:
|
||||
continue
|
||||
seen_pairs.add(pair)
|
||||
pair = (record.file_path, record.url, final_url)
|
||||
if pair in seen_pairs:
|
||||
continue
|
||||
seen_pairs.add(pair)
|
||||
|
||||
console.print(f"\n[cyan]{record.file_path}:{record.line}[/cyan]")
|
||||
console.print(f"[yellow]{record.url}[/yellow] -> [green]{final_url}[/green]")
|
||||
answer = console.input("Replace HTTP URL with HTTPS variant? [y/N] ").strip().lower()
|
||||
if answer != "y":
|
||||
continue
|
||||
console.print(f"\n[cyan]{record.file_path}:{record.line}[/cyan]")
|
||||
console.print(f"[yellow]{record.url}[/yellow] -> [green]{final_url}[/green]")
|
||||
if not _prompt_yes_no(console, "Replace old URL? [y/N] "):
|
||||
continue
|
||||
|
||||
verification = _cached_check(checker=checker, cache=check_cache, url=final_url)
|
||||
if verification.status_code != 200:
|
||||
console.print(
|
||||
f"[red]Skip:[/red] HTTPS URL no longer valid ({verification.status_code or verification.error})"
|
||||
)
|
||||
continue
|
||||
replacements_by_file[record.file_path][record.url] = final_url
|
||||
verification = _cached_check(checker=checker, cache=check_cache, url=final_url)
|
||||
if verification.final_status_code != 200:
|
||||
console.print(
|
||||
"[red]Skip:[/red] New URL returned "
|
||||
f"({verification.final_status_code or verification.error})."
|
||||
)
|
||||
continue
|
||||
console.print("[green]Done:[/green] New URL returned (200).")
|
||||
replacements_by_file[record.file_path][record.url] = final_url
|
||||
except RewriteAborted:
|
||||
console.print("\n[yellow]Aborted by user during rewrite prompts. Discarded pending changes.[/yellow]")
|
||||
return
|
||||
|
||||
for file_path, replacements in replacements_by_file.items():
|
||||
content = file_path.read_text(encoding="utf-8")
|
||||
@@ -216,7 +264,39 @@ def main() -> None:
|
||||
check_codes = set(args.check_codes) if args.check_codes else None
|
||||
|
||||
scanner = MarkdownScanner()
|
||||
records = scanner.scan_path(args.path)
|
||||
all_files = list(iter_markdown_files(args.path))
|
||||
if not all_files:
|
||||
console.print("No Markdown files found.")
|
||||
return
|
||||
|
||||
old_state = set() if args.rescan else _load_state(STATE_FILE)
|
||||
files_to_scan: list[Path] = []
|
||||
skipped_count = 0
|
||||
for file_path in all_files:
|
||||
state_key = _normalize_state_key(file_path)
|
||||
if state_key in old_state:
|
||||
skipped_count += 1
|
||||
continue
|
||||
files_to_scan.append(file_path)
|
||||
|
||||
console.print(
|
||||
f"Files total: {len(all_files)} | to scan: {len(files_to_scan)} | skipped: {skipped_count}"
|
||||
)
|
||||
|
||||
if not files_to_scan:
|
||||
console.print("No new files to scan. Use --rescan to force a full scan.")
|
||||
return
|
||||
|
||||
records: list[LinkRecord] = []
|
||||
for file_path in files_to_scan:
|
||||
content = file_path.read_text(encoding="utf-8")
|
||||
records.extend(scanner.scan_content(file_path=file_path, content=content))
|
||||
|
||||
new_state = set(old_state)
|
||||
for file_path in files_to_scan:
|
||||
new_state.add(_normalize_state_key(file_path))
|
||||
_save_state(STATE_FILE, new_state)
|
||||
|
||||
if not records:
|
||||
console.print("No links found.")
|
||||
return
|
||||
|
||||
+8
-2
@@ -16,13 +16,19 @@ class LinkRecord:
|
||||
@dataclass(frozen=True)
|
||||
class LinkCheckResult:
|
||||
original_url: str
|
||||
status_code: Optional[int]
|
||||
initial_status_code: Optional[int]
|
||||
final_status_code: Optional[int]
|
||||
final_url: Optional[str]
|
||||
redirected: bool
|
||||
error: Optional[str] = None
|
||||
|
||||
@property
|
||||
def status_code(self) -> Optional[int]:
|
||||
# Backward-compatible alias used by reporting and --check filtering.
|
||||
return self.initial_status_code
|
||||
|
||||
@property
|
||||
def should_report(self) -> bool:
|
||||
if self.error is not None:
|
||||
return True
|
||||
return self.status_code != 200
|
||||
return self.initial_status_code != 200
|
||||
|
||||
Reference in New Issue
Block a user