feat: add incremental file scan state with optional --rescan full run
This commit is contained in:
@@ -32,6 +32,7 @@ htmlcov/
|
|||||||
Thumbs.db
|
Thumbs.db
|
||||||
*.log
|
*.log
|
||||||
.codex
|
.codex
|
||||||
|
.mdlink-state.json
|
||||||
|
|
||||||
# IDE/editor settings
|
# IDE/editor settings
|
||||||
.idea/
|
.idea/
|
||||||
|
|||||||
@@ -38,9 +38,13 @@ mdlink .
|
|||||||
|
|
||||||
- `--timeout FLOAT`
|
- `--timeout FLOAT`
|
||||||
Per-request timeout in seconds (default: `10.0`).
|
Per-request timeout in seconds (default: `10.0`).
|
||||||
|
- `--rescan`
|
||||||
|
Discard `.mdlink-state.json` and run a full scan on all matching files.
|
||||||
- `--check CODE`
|
- `--check CODE`
|
||||||
Report only selected HTTP status codes. Repeat option for multiple codes (for example `--check 404` or `--check 301 --check 404`).
|
Report only selected HTTP status codes. Repeat option for multiple codes (for example `--check 404` or `--check 301 --check 404`).
|
||||||
|
|
||||||
|
By default, `mdlink` stores scanned file paths in `.mdlink-state.json` and skips those files in later runs.
|
||||||
|
|
||||||
## Interactive Redirect Rewrite
|
## Interactive Redirect Rewrite
|
||||||
|
|
||||||
Step 1: When a Markdown link redirects, `mdlink` prompts:
|
Step 1: When a Markdown link redirects, `mdlink` prompts:
|
||||||
|
|||||||
+6
-3
@@ -29,12 +29,14 @@ class LinkChecker:
|
|||||||
def check(self, url: str) -> LinkCheckResult:
|
def check(self, url: str) -> LinkCheckResult:
|
||||||
try:
|
try:
|
||||||
response = self._client.get(url)
|
response = self._client.get(url)
|
||||||
original_status = response.history[0].status_code if response.history else response.status_code
|
initial_status = response.history[0].status_code if response.history else response.status_code
|
||||||
|
final_status = response.status_code
|
||||||
final_url = str(response.url)
|
final_url = str(response.url)
|
||||||
redirected = final_url != url
|
redirected = final_url != url
|
||||||
return LinkCheckResult(
|
return LinkCheckResult(
|
||||||
original_url=url,
|
original_url=url,
|
||||||
status_code=original_status,
|
initial_status_code=initial_status,
|
||||||
|
final_status_code=final_status,
|
||||||
final_url=final_url,
|
final_url=final_url,
|
||||||
redirected=redirected,
|
redirected=redirected,
|
||||||
error=None,
|
error=None,
|
||||||
@@ -42,7 +44,8 @@ class LinkChecker:
|
|||||||
except httpx.HTTPError as exc:
|
except httpx.HTTPError as exc:
|
||||||
return LinkCheckResult(
|
return LinkCheckResult(
|
||||||
original_url=url,
|
original_url=url,
|
||||||
status_code=None,
|
initial_status_code=None,
|
||||||
|
final_status_code=None,
|
||||||
final_url=None,
|
final_url=None,
|
||||||
redirected=False,
|
redirected=False,
|
||||||
error=str(exc),
|
error=str(exc),
|
||||||
|
|||||||
+91
-11
@@ -1,6 +1,7 @@
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
|
import json
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
@@ -12,13 +13,24 @@ from .ast_editor import ASTMarkdownEditor
|
|||||||
from .checker import LinkChecker
|
from .checker import LinkChecker
|
||||||
from .models import LinkCheckResult, LinkRecord
|
from .models import LinkCheckResult, LinkRecord
|
||||||
from .scanner import MarkdownScanner
|
from .scanner import MarkdownScanner
|
||||||
from .utils import unique_preserve_order
|
from .utils import iter_markdown_files, unique_preserve_order
|
||||||
|
|
||||||
|
STATE_FILE = Path(".mdlink-state.json")
|
||||||
|
|
||||||
|
|
||||||
|
class RewriteAborted(Exception):
|
||||||
|
"""Raised when user aborts interactive rewrite prompts."""
|
||||||
|
|
||||||
|
|
||||||
def parse_args() -> argparse.Namespace:
|
def parse_args() -> argparse.Namespace:
|
||||||
parser = argparse.ArgumentParser(prog="mdlink", description="Scan Markdown files and validate links.")
|
parser = argparse.ArgumentParser(prog="mdlink", description="Scan Markdown files and validate links.")
|
||||||
parser.add_argument("path", type=Path, help="Directory or Markdown file to scan")
|
parser.add_argument("path", type=Path, help="Directory or Markdown file to scan")
|
||||||
parser.add_argument("--timeout", type=float, default=10.0, help="Request timeout in seconds")
|
parser.add_argument("--timeout", type=float, default=10.0, help="Request timeout in seconds")
|
||||||
|
parser.add_argument(
|
||||||
|
"--rescan",
|
||||||
|
action="store_true",
|
||||||
|
help="Discard existing scan state and rescan all matching files.",
|
||||||
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--check",
|
"--check",
|
||||||
dest="check_codes",
|
dest="check_codes",
|
||||||
@@ -30,6 +42,28 @@ def parse_args() -> argparse.Namespace:
|
|||||||
return parser.parse_args()
|
return parser.parse_args()
|
||||||
|
|
||||||
|
|
||||||
|
def _normalize_state_key(path: Path) -> str:
    """Return a spelling-independent key identifying ``path`` in the scan state.

    The path is resolved first, so ``docs/a.md``, ``docs/../docs/a.md`` and the
    absolute form of the same file all map to one key. Files below the current
    working directory keep a relative key, which is what ``str(path)`` already
    produced for plain relative paths.

    Args:
        path: Path of a file, relative or absolute.

    Returns:
        The resolved path relative to the current working directory, or the
        resolved absolute path when the file lies outside of it.
    """
    resolved = path.resolve()
    try:
        return str(resolved.relative_to(Path.cwd().resolve()))
    except ValueError:
        # relative_to() raises ValueError when the file is not below the
        # working directory; the absolute path is still a stable key.
        return str(resolved)
|
||||||
|
|
||||||
|
|
||||||
|
def _load_state(path: Path) -> set[str]:
    """Read the set of already-scanned file keys from the JSON state file.

    The state file is treated as a best-effort cache: if it is missing,
    unreadable, not valid UTF-8, not valid JSON, or not shaped like
    ``{"files": [...]}``, an empty set is returned.

    Args:
        path: Location of the JSON state file.

    Returns:
        The string entries of the top-level ``"files"`` list; non-string
        entries are ignored. Empty when no usable state exists.
    """
    try:
        payload = json.loads(path.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # OSError covers a missing or unreadable file. ValueError is the base
        # class of both json.JSONDecodeError and UnicodeDecodeError.
        return set()

    if not isinstance(payload, dict):
        # Valid JSON that is not an object (e.g. a bare list) has no "files".
        return set()

    files = payload.get("files")
    if not isinstance(files, list):
        return set()
    return {item for item in files if isinstance(item, str)}
|
||||||
|
|
||||||
|
|
||||||
|
def _save_state(path: Path, scanned_files: set[str]) -> None:
    """Persist the scanned-file keys to the JSON state file.

    The payload is written to a sibling temporary file and then moved over
    ``path``, so an interrupted run does not leave a truncated state file
    behind.

    Args:
        path: Destination of the JSON state file.
        scanned_files: Keys of all files that count as scanned.

    Raises:
        OSError: If the temporary file cannot be written or moved into place.
    """
    payload = {"files": sorted(scanned_files)}
    serialized = json.dumps(payload, indent=2, ensure_ascii=True) + "\n"

    staging = path.with_name(path.name + ".tmp")
    try:
        staging.write_text(serialized, encoding="utf-8")
        # Path.replace overwrites an existing destination in a single rename.
        staging.replace(path)
    except BaseException:
        # Also runs on Ctrl-C: do not leave the half-written temp file around.
        try:
            staging.unlink()
        except OSError:
            pass
        raise
|
||||||
|
|
||||||
|
|
||||||
def _is_listed_result(result: LinkCheckResult, check_codes: Optional[set[int]]) -> bool:
|
def _is_listed_result(result: LinkCheckResult, check_codes: Optional[set[int]]) -> bool:
|
||||||
if check_codes:
|
if check_codes:
|
||||||
return result.status_code is not None and result.status_code in check_codes
|
return result.status_code is not None and result.status_code in check_codes
|
||||||
@@ -128,6 +162,14 @@ def _collect_https_candidates(
|
|||||||
return candidates
|
return candidates
|
||||||
|
|
||||||
|
|
||||||
|
def _prompt_yes_no(console: Console, prompt: str) -> bool:
    """Ask a yes/no question and report whether the answer was "y".

    Args:
        console: Object whose ``input(prompt)`` method returns the typed line.
        prompt: Text shown to the user.

    Returns:
        True only when the stripped, lower-cased answer is exactly ``"y"``;
        anything else, including an empty answer, is False.

    Raises:
        RewriteAborted: If the prompt is interrupted (Ctrl-C) or input ends
            (Ctrl-D / closed stdin) before an answer is given.
    """
    try:
        answer = console.input(prompt).strip().lower()
    except (KeyboardInterrupt, EOFError) as exc:
        # End-of-input is handled like Ctrl-C so it becomes the same abort
        # signal instead of escaping as an unhandled EOFError.
        raise RewriteAborted from exc
    return answer == "y"
|
||||||
|
|
||||||
|
|
||||||
def _handle_rewrites(
|
def _handle_rewrites(
|
||||||
records: list[LinkRecord],
|
records: list[LinkRecord],
|
||||||
checks: dict[str, LinkCheckResult],
|
checks: dict[str, LinkCheckResult],
|
||||||
@@ -141,6 +183,7 @@ def _handle_rewrites(
|
|||||||
seen_pairs: set[tuple[Path, str, str]] = set()
|
seen_pairs: set[tuple[Path, str, str]] = set()
|
||||||
check_cache: dict[str, LinkCheckResult] = {}
|
check_cache: dict[str, LinkCheckResult] = {}
|
||||||
|
|
||||||
|
try:
|
||||||
if redirects:
|
if redirects:
|
||||||
console.print("\n[bold]Redirect replacements[/bold]")
|
console.print("\n[bold]Redirect replacements[/bold]")
|
||||||
|
|
||||||
@@ -157,16 +200,17 @@ def _handle_rewrites(
|
|||||||
|
|
||||||
console.print(f"\n[cyan]{record.file_path}:{record.line}[/cyan]")
|
console.print(f"\n[cyan]{record.file_path}:{record.line}[/cyan]")
|
||||||
console.print(f"[yellow]{record.url}[/yellow] -> [green]{final_url}[/green]")
|
console.print(f"[yellow]{record.url}[/yellow] -> [green]{final_url}[/green]")
|
||||||
answer = console.input("Replace old URL with final URL? [y/N] ").strip().lower()
|
if not _prompt_yes_no(console, "Replace old URL? [y/N] "):
|
||||||
if answer != "y":
|
|
||||||
continue
|
continue
|
||||||
|
|
||||||
verification = _cached_check(checker=checker, cache=check_cache, url=final_url)
|
verification = _cached_check(checker=checker, cache=check_cache, url=final_url)
|
||||||
if verification.status_code != 200:
|
if verification.final_status_code != 200:
|
||||||
console.print(
|
console.print(
|
||||||
f"[red]Skip:[/red] final URL no longer valid ({verification.status_code or verification.error})"
|
"[red]Skip:[/red] New URL returned "
|
||||||
|
f"({verification.final_status_code or verification.error})."
|
||||||
)
|
)
|
||||||
continue
|
continue
|
||||||
|
console.print("[green]Done:[/green] New URL returned (200).")
|
||||||
replacements_by_file[record.file_path][record.url] = final_url
|
replacements_by_file[record.file_path][record.url] = final_url
|
||||||
|
|
||||||
https_candidates = _collect_https_candidates(records=records, checks=checks, check_codes=check_codes)
|
https_candidates = _collect_https_candidates(records=records, checks=checks, check_codes=check_codes)
|
||||||
@@ -177,7 +221,7 @@ def _handle_rewrites(
|
|||||||
if replacements_by_file[record.file_path].get(record.url):
|
if replacements_by_file[record.file_path].get(record.url):
|
||||||
continue
|
continue
|
||||||
https_check = _cached_check(checker=checker, cache=check_cache, url=https_url)
|
https_check = _cached_check(checker=checker, cache=check_cache, url=https_url)
|
||||||
if https_check.status_code != 200:
|
if https_check.final_status_code != 200:
|
||||||
continue
|
continue
|
||||||
final_url: Optional[str] = https_check.final_url or https_url
|
final_url: Optional[str] = https_check.final_url or https_url
|
||||||
if final_url == record.url:
|
if final_url == record.url:
|
||||||
@@ -190,17 +234,21 @@ def _handle_rewrites(
|
|||||||
|
|
||||||
console.print(f"\n[cyan]{record.file_path}:{record.line}[/cyan]")
|
console.print(f"\n[cyan]{record.file_path}:{record.line}[/cyan]")
|
||||||
console.print(f"[yellow]{record.url}[/yellow] -> [green]{final_url}[/green]")
|
console.print(f"[yellow]{record.url}[/yellow] -> [green]{final_url}[/green]")
|
||||||
answer = console.input("Replace HTTP URL with HTTPS variant? [y/N] ").strip().lower()
|
if not _prompt_yes_no(console, "Replace old URL? [y/N] "):
|
||||||
if answer != "y":
|
|
||||||
continue
|
continue
|
||||||
|
|
||||||
verification = _cached_check(checker=checker, cache=check_cache, url=final_url)
|
verification = _cached_check(checker=checker, cache=check_cache, url=final_url)
|
||||||
if verification.status_code != 200:
|
if verification.final_status_code != 200:
|
||||||
console.print(
|
console.print(
|
||||||
f"[red]Skip:[/red] HTTPS URL no longer valid ({verification.status_code or verification.error})"
|
"[red]Skip:[/red] New URL returned "
|
||||||
|
f"({verification.final_status_code or verification.error})."
|
||||||
)
|
)
|
||||||
continue
|
continue
|
||||||
|
console.print("[green]Done:[/green] New URL returned (200).")
|
||||||
replacements_by_file[record.file_path][record.url] = final_url
|
replacements_by_file[record.file_path][record.url] = final_url
|
||||||
|
except RewriteAborted:
|
||||||
|
console.print("\n[yellow]Aborted by user during rewrite prompts. Discarded pending changes.[/yellow]")
|
||||||
|
return
|
||||||
|
|
||||||
for file_path, replacements in replacements_by_file.items():
|
for file_path, replacements in replacements_by_file.items():
|
||||||
content = file_path.read_text(encoding="utf-8")
|
content = file_path.read_text(encoding="utf-8")
|
||||||
@@ -216,7 +264,39 @@ def main() -> None:
|
|||||||
check_codes = set(args.check_codes) if args.check_codes else None
|
check_codes = set(args.check_codes) if args.check_codes else None
|
||||||
|
|
||||||
scanner = MarkdownScanner()
|
scanner = MarkdownScanner()
|
||||||
records = scanner.scan_path(args.path)
|
all_files = list(iter_markdown_files(args.path))
|
||||||
|
if not all_files:
|
||||||
|
console.print("No Markdown files found.")
|
||||||
|
return
|
||||||
|
|
||||||
|
old_state = set() if args.rescan else _load_state(STATE_FILE)
|
||||||
|
files_to_scan: list[Path] = []
|
||||||
|
skipped_count = 0
|
||||||
|
for file_path in all_files:
|
||||||
|
state_key = _normalize_state_key(file_path)
|
||||||
|
if state_key in old_state:
|
||||||
|
skipped_count += 1
|
||||||
|
continue
|
||||||
|
files_to_scan.append(file_path)
|
||||||
|
|
||||||
|
console.print(
|
||||||
|
f"Files total: {len(all_files)} | to scan: {len(files_to_scan)} | skipped: {skipped_count}"
|
||||||
|
)
|
||||||
|
|
||||||
|
if not files_to_scan:
|
||||||
|
console.print("No new files to scan. Use --rescan to force a full scan.")
|
||||||
|
return
|
||||||
|
|
||||||
|
records: list[LinkRecord] = []
|
||||||
|
for file_path in files_to_scan:
|
||||||
|
content = file_path.read_text(encoding="utf-8")
|
||||||
|
records.extend(scanner.scan_content(file_path=file_path, content=content))
|
||||||
|
|
||||||
|
new_state = set(old_state)
|
||||||
|
for file_path in files_to_scan:
|
||||||
|
new_state.add(_normalize_state_key(file_path))
|
||||||
|
_save_state(STATE_FILE, new_state)
|
||||||
|
|
||||||
if not records:
|
if not records:
|
||||||
console.print("No links found.")
|
console.print("No links found.")
|
||||||
return
|
return
|
||||||
|
|||||||
+8
-2
@@ -16,13 +16,19 @@ class LinkRecord:
|
|||||||
@dataclass(frozen=True)
|
@dataclass(frozen=True)
|
||||||
class LinkCheckResult:
|
class LinkCheckResult:
|
||||||
original_url: str
|
original_url: str
|
||||||
status_code: Optional[int]
|
initial_status_code: Optional[int]
|
||||||
|
final_status_code: Optional[int]
|
||||||
final_url: Optional[str]
|
final_url: Optional[str]
|
||||||
redirected: bool
|
redirected: bool
|
||||||
error: Optional[str] = None
|
error: Optional[str] = None
|
||||||
|
|
||||||
|
@property
|
||||||
|
def status_code(self) -> Optional[int]:
|
||||||
|
# Backward-compatible alias used by reporting and --check filtering.
|
||||||
|
return self.initial_status_code
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def should_report(self) -> bool:
|
def should_report(self) -> bool:
|
||||||
if self.error is not None:
|
if self.error is not None:
|
||||||
return True
|
return True
|
||||||
return self.status_code != 200
|
return self.initial_status_code != 200
|
||||||
|
|||||||
Reference in New Issue
Block a user