diff --git a/.gitignore b/.gitignore index 1e06c28..c1b971e 100644 --- a/.gitignore +++ b/.gitignore @@ -41,3 +41,4 @@ Thumbs.db # Local test artifacts test_preview.md +blog/ diff --git a/README.md b/README.md index f34e97d..5334969 100644 --- a/README.md +++ b/README.md @@ -38,6 +38,8 @@ mdlink . - `--timeout FLOAT` Per-request timeout in seconds (default: `10.0`). +- `--check CODE` + Report only selected HTTP status codes. Repeat option for multiple codes (for example `--check 404` or `--check 301 --check 404`). ## Interactive Redirect Rewrite diff --git a/mdlink/checker.py b/mdlink/checker.py index 0af97f9..1a1b3fc 100644 --- a/mdlink/checker.py +++ b/mdlink/checker.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import Iterable +from typing import Callable, Iterable, Optional import httpx @@ -29,11 +29,12 @@ class LinkChecker: def check(self, url: str) -> LinkCheckResult: try: response = self._client.get(url) + original_status = response.history[0].status_code if response.history else response.status_code final_url = str(response.url) redirected = final_url != url return LinkCheckResult( original_url=url, - status_code=response.status_code, + status_code=original_status, final_url=final_url, redirected=redirected, error=None, @@ -47,8 +48,16 @@ class LinkChecker: error=str(exc), ) - def check_many(self, urls: Iterable[str]) -> dict[str, LinkCheckResult]: + def check_many( + self, + urls: Iterable[str], + progress_callback: Optional[Callable[[int, int, str], None]] = None, + ) -> dict[str, LinkCheckResult]: results: dict[str, LinkCheckResult] = {} - for url in unique_preserve_order(urls): + unique_urls = unique_preserve_order(urls) + total = len(unique_urls) + for index, url in enumerate(unique_urls, start=1): results[url] = self.check(url) + if progress_callback is not None: + progress_callback(index, total, url) return results diff --git a/mdlink/cli.py b/mdlink/cli.py index 267aace..ecf3fa7 100644 --- a/mdlink/cli.py +++ b/mdlink/cli.py @@ -19,10 +19,28 @@ def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser(prog="mdlink", description="Scan Markdown files and validate links.") parser.add_argument("path", type=Path, help="Directory or Markdown file to scan") parser.add_argument("--timeout", type=float, default=10.0, help="Request timeout in seconds") + parser.add_argument( + "--check", + dest="check_codes", + type=int, + action="append", + metavar="CODE", + help="Report only selected HTTP status codes. Repeat for multiple values (e.g. --check 404 --check 301).", + ) return parser.parse_args() -def _build_report_table(records: list[LinkRecord], checks: dict[str, LinkCheckResult]) -> Table: +def _is_listed_result(result: LinkCheckResult, check_codes: Optional[set[int]]) -> bool: + if check_codes: + return result.status_code is not None and result.status_code in check_codes + return result.should_report + + +def _build_report_table( + records: list[LinkRecord], + checks: dict[str, LinkCheckResult], + check_codes: Optional[set[int]] = None, +) -> Table: table = Table(title="Non-200 Links") table.add_column("file") table.add_column("line", justify="right") @@ -32,7 +50,7 @@ def _build_report_table(records: list[LinkRecord], checks: dict[str, LinkCheckRe for record in records: result = checks[record.url] - if not result.should_report: + if not _is_listed_result(result=result, check_codes=check_codes): continue status_value = str(result.status_code) if result.status_code is not None else f"ERR: {result.error}" table.add_row( @@ -45,10 +63,16 @@ def _build_report_table(records: list[LinkRecord], checks: dict[str, LinkCheckRe return table -def _collect_redirects(records: list[LinkRecord], checks: dict[str, LinkCheckResult]) -> list[tuple[LinkRecord, LinkCheckResult]]: +def _collect_redirects( + records: list[LinkRecord], + checks: dict[str, LinkCheckResult], + check_codes: Optional[set[int]] = None, +) -> list[tuple[LinkRecord, LinkCheckResult]]: redirects: list[tuple[LinkRecord, LinkCheckResult]] = [] for record in records: result = checks[record.url] + if not _is_listed_result(result=result, check_codes=check_codes): + continue if not result.redirected: continue if not result.final_url: @@ -80,6 +104,7 @@ def _cached_check( def _collect_https_candidates( records: list[LinkRecord], checks: dict[str, LinkCheckResult], + check_codes: Optional[set[int]] = None, ) -> list[tuple[LinkRecord, str]]: candidates: list[tuple[LinkRecord, str]] = [] seen: set[tuple[Path, str]] = set() @@ -89,6 +114,10 @@ def _collect_https_candidates( if not _is_http_url(record.url): continue original_check = checks.get(record.url) + if original_check is None: + continue + if not _is_listed_result(result=original_check, check_codes=check_codes): + continue if original_check and original_check.redirected: continue key = (record.file_path, record.url) @@ -102,6 +131,7 @@ def _collect_https_candidates( def _handle_rewrites( records: list[LinkRecord], checks: dict[str, LinkCheckResult], + check_codes: Optional[set[int]], redirects: list[tuple[LinkRecord, LinkCheckResult]], checker: LinkChecker, editor: ASTMarkdownEditor, @@ -139,7 +169,7 @@ def _handle_rewrites( continue replacements_by_file[record.file_path][record.url] = final_url - https_candidates = _collect_https_candidates(records=records, checks=checks) + https_candidates = _collect_https_candidates(records=records, checks=checks, check_codes=check_codes) if https_candidates: console.print("\n[bold]HTTPS upgrade candidates[/bold]") @@ -183,6 +213,7 @@ def _handle_rewrites( def main() -> None: args = parse_args() console = Console() + check_codes = set(args.check_codes) if args.check_codes else None scanner = MarkdownScanner() records = scanner.scan_path(args.path) @@ -192,18 +223,36 @@ def main() -> None: urls = unique_preserve_order(record.url for record in records) with LinkChecker(timeout=args.timeout) as checker: - checks = checker.check_many(urls) - table = _build_report_table(records, checks) + total_urls = len(urls) + console.print(f"Checking {total_urls} unique URLs...") + + def progress(current: int, total: int, url: str) -> None: + _ = url + if current == 1 or current % 25 == 0 or current == total: + console.print(f"[dim]Progress: {current}/{total}[/dim]") + + try: + checks = checker.check_many(urls, progress_callback=progress) + except KeyboardInterrupt: + console.print("\n[yellow]Aborted by user during link checks.[/yellow]") + return + + table = _build_report_table(records, checks, check_codes=check_codes) if table.row_count: console.print(table) else: - console.print("No non-200 links found.") + if check_codes: + sorted_codes = ", ".join(str(code) for code in sorted(check_codes)) + console.print(f"No links found with status code(s): {sorted_codes}.") + else: + console.print("No non-200 links found.") - redirects = _collect_redirects(records, checks) + redirects = _collect_redirects(records, checks, check_codes=check_codes) editor = ASTMarkdownEditor() _handle_rewrites( records=records, checks=checks, + check_codes=check_codes, redirects=redirects, checker=checker, editor=editor,