diff --git a/.gitignore b/.gitignore index c1b971e..2ddf218 100644 --- a/.gitignore +++ b/.gitignore @@ -32,6 +32,7 @@ htmlcov/ Thumbs.db *.log .codex +.mdlink-state.json # IDE/editor settings .idea/ diff --git a/README.md b/README.md index 5334969..055e948 100644 --- a/README.md +++ b/README.md @@ -38,9 +38,13 @@ mdlink . - `--timeout FLOAT` Per-request timeout in seconds (default: `10.0`). +- `--rescan` + Discard `.mdlink-state.json` and run a full scan on all matching files. - `--check CODE` Report only selected HTTP status codes. Repeat option for multiple codes (for example `--check 404` or `--check 301 --check 404`). +By default, `mdlink` stores scanned file paths in `.mdlink-state.json` and skips those files in later runs. + ## Interactive Redirect Rewrite Step 1: When a Markdown link redirects, `mdlink` prompts: diff --git a/mdlink/checker.py b/mdlink/checker.py index 1a1b3fc..64c473c 100644 --- a/mdlink/checker.py +++ b/mdlink/checker.py @@ -29,12 +29,14 @@ class LinkChecker: def check(self, url: str) -> LinkCheckResult: try: response = self._client.get(url) - original_status = response.history[0].status_code if response.history else response.status_code + initial_status = response.history[0].status_code if response.history else response.status_code + final_status = response.status_code final_url = str(response.url) redirected = final_url != url return LinkCheckResult( original_url=url, - status_code=original_status, + initial_status_code=initial_status, + final_status_code=final_status, final_url=final_url, redirected=redirected, error=None, @@ -42,7 +44,8 @@ class LinkChecker: except httpx.HTTPError as exc: return LinkCheckResult( original_url=url, - status_code=None, + initial_status_code=None, + final_status_code=None, final_url=None, redirected=False, error=str(exc), diff --git a/mdlink/cli.py b/mdlink/cli.py index ecf3fa7..9fbf958 100644 --- a/mdlink/cli.py +++ b/mdlink/cli.py @@ -1,6 +1,7 @@ from __future__ import annotations import argparse +import 
def _load_state(path: Path) -> set[str]:
    """Return the scanned-file keys recorded in the state file.

    The state file is expected to contain a JSON object shaped like
    ``{"files": ["docs/a.md", ...]}``. Loading is best-effort: whenever the
    file is missing, unreadable, not valid UTF-8, not valid JSON, or not
    shaped as expected, an empty set is returned so the caller falls back to
    scanning everything.

    Args:
        path: Location of the JSON state file.

    Returns:
        The string entries of the ``"files"`` list; non-string entries are
        dropped. An empty set when the file cannot be used.
    """
    try:
        payload = json.loads(path.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # OSError: missing or unreadable file (FileNotFoundError included).
        # ValueError: json.JSONDecodeError and UnicodeDecodeError are both
        # subclasses, so malformed JSON and invalid UTF-8 land here.
        return set()

    # Valid JSON is not necessarily an object: a bare list, string or number
    # has no ``.get`` and would otherwise raise AttributeError.
    if not isinstance(payload, dict):
        return set()

    files = payload.get("files")
    if not isinstance(files, list):
        return set()
    return {item for item in files if isinstance(item, str)}
def _prompt_yes_no(console: Console, prompt: str) -> bool:
    """Ask a yes/no question and report whether the answer was "y".

    Args:
        console: Object whose ``input(prompt)`` method shows the prompt and
            returns the line the user typed.
        prompt: Text displayed to the user.

    Returns:
        True only when the answer, stripped and lower-cased, is exactly
        ``"y"``. Every other answer, including an empty one, counts as "no".

    Raises:
        RewriteAborted: If the prompt is interrupted (Ctrl-C) or the input
            stream ends before an answer is read (Ctrl-D / closed stdin).
    """
    try:
        answer = console.input(prompt)
    except (KeyboardInterrupt, EOFError) as exc:
        # End-of-input is treated like an interrupt: no further prompt could
        # be answered, so abort through the same exception instead of
        # letting EOFError escape as a traceback.
        raise RewriteAborted from exc
    return answer.strip().lower() == "y"
[y/N] "): + continue - verification = _cached_check(checker=checker, cache=check_cache, url=final_url) - if verification.status_code != 200: - console.print( - f"[red]Skip:[/red] final URL no longer valid ({verification.status_code or verification.error})" - ) - continue - replacements_by_file[record.file_path][record.url] = final_url + verification = _cached_check(checker=checker, cache=check_cache, url=final_url) + if verification.final_status_code != 200: + console.print( + "[red]Skip:[/red] New URL returned " + f"({verification.final_status_code or verification.error})." + ) + continue + console.print("[green]Done:[/green] New URL returned (200).") + replacements_by_file[record.file_path][record.url] = final_url - https_candidates = _collect_https_candidates(records=records, checks=checks, check_codes=check_codes) - if https_candidates: - console.print("\n[bold]HTTPS upgrade candidates[/bold]") + https_candidates = _collect_https_candidates(records=records, checks=checks, check_codes=check_codes) + if https_candidates: + console.print("\n[bold]HTTPS upgrade candidates[/bold]") - for record, https_url in https_candidates: - if replacements_by_file[record.file_path].get(record.url): - continue - https_check = _cached_check(checker=checker, cache=check_cache, url=https_url) - if https_check.status_code != 200: - continue - final_url: Optional[str] = https_check.final_url or https_url - if final_url == record.url: - continue + for record, https_url in https_candidates: + if replacements_by_file[record.file_path].get(record.url): + continue + https_check = _cached_check(checker=checker, cache=check_cache, url=https_url) + if https_check.final_status_code != 200: + continue + final_url: Optional[str] = https_check.final_url or https_url + if final_url == record.url: + continue - pair = (record.file_path, record.url, final_url) - if pair in seen_pairs: - continue - seen_pairs.add(pair) + pair = (record.file_path, record.url, final_url) + if pair in seen_pairs: + 
continue + seen_pairs.add(pair) - console.print(f"\n[cyan]{record.file_path}:{record.line}[/cyan]") - console.print(f"[yellow]{record.url}[/yellow] -> [green]{final_url}[/green]") - answer = console.input("Replace HTTP URL with HTTPS variant? [y/N] ").strip().lower() - if answer != "y": - continue + console.print(f"\n[cyan]{record.file_path}:{record.line}[/cyan]") + console.print(f"[yellow]{record.url}[/yellow] -> [green]{final_url}[/green]") + if not _prompt_yes_no(console, "Replace old URL? [y/N] "): + continue - verification = _cached_check(checker=checker, cache=check_cache, url=final_url) - if verification.status_code != 200: - console.print( - f"[red]Skip:[/red] HTTPS URL no longer valid ({verification.status_code or verification.error})" - ) - continue - replacements_by_file[record.file_path][record.url] = final_url + verification = _cached_check(checker=checker, cache=check_cache, url=final_url) + if verification.final_status_code != 200: + console.print( + "[red]Skip:[/red] New URL returned " + f"({verification.final_status_code or verification.error})." + ) + continue + console.print("[green]Done:[/green] New URL returned (200).") + replacements_by_file[record.file_path][record.url] = final_url + except RewriteAborted: + console.print("\n[yellow]Aborted by user during rewrite prompts. 
@dataclass(frozen=True)
class LinkCheckResult:
    """Immutable outcome of checking one URL.

    Carries both the status of the first response and the status of the
    last response, so callers can tell what the original URL answered
    apart from what the final URL answered.
    """

    # URL exactly as it was requested.
    original_url: str
    # Status code of the first response; None when the request failed.
    initial_status_code: Optional[int]
    # Status code of the last response; None when the request failed.
    final_status_code: Optional[int]
    # URL of the last response; None when the request failed.
    final_url: Optional[str]
    # Whether the final URL differs from the requested one.
    redirected: bool
    # Error text when the request failed, otherwise None.
    error: Optional[str] = None

    @property
    def status_code(self) -> Optional[int]:
        """Return ``initial_status_code`` under its former name.

        Backward-compatible alias used by reporting and --check filtering.
        """
        return self.initial_status_code

    @property
    def should_report(self) -> bool:
        """Return True when the check errored or the initial status is not 200."""
        return self.error is not None or self.initial_status_code != 200