From 86d7dc64d41255ff33af9924f914d44604ab7346 Mon Sep 17 00:00:00 2001 From: drg Date: Fri, 17 Apr 2026 21:30:26 +0200 Subject: [PATCH] feat: add persistent scan_index/url_policy state with --rescan and --reset-url-policy --- README.md | 29 +++++--- mdlink/cli.py | 160 ++++++++++++++++++++++++++++++++++------ test.md | 22 ------ test.sh | 108 ++++++++++++++++++++++++--- testdata/one.markdown | 7 ++ testdata/three.markdown | 7 ++ testdata/two.markdown | 8 ++ 7 files changed, 275 insertions(+), 66 deletions(-) delete mode 100644 test.md create mode 100644 testdata/one.markdown create mode 100644 testdata/three.markdown create mode 100644 testdata/two.markdown diff --git a/README.md b/README.md index 055e948..3ef6105 100644 --- a/README.md +++ b/README.md @@ -39,24 +39,31 @@ mdlink . - `--timeout FLOAT` Per-request timeout in seconds (default: `10.0`). - `--rescan` - Discard `.mdlink-state.json` and run a full scan on all matching files. + Reset only `scan_index` in `.mdlink-state.json` and run a full scan on all matching files. +- `--reset-url-policy` + Clear all stored URL ignore policies before scanning. - `--check CODE` Report only selected HTTP status codes. Repeat option for multiple codes (for example `--check 404` or `--check 301 --check 404`). -By default, `mdlink` stores scanned file paths in `.mdlink-state.json` and skips those files in later runs. +By default, `mdlink` stores: + +- `scan_index`: already scanned files for incremental directory scans. +- `url_policy`: ignored URL decisions per file. + +Single-file scans do not use state. ## Interactive Redirect Rewrite Step 1: When a Markdown link redirects, `mdlink` prompts: ```text -Replace old URL with final URL? [y/N] +Replace old URL? [y/N] ``` Step 2: For `http://` Markdown links without redirect, `mdlink` can test an `https://` variant and prompt: ```text -Replace HTTP URL with HTTPS variant? [y/N] +Replace old URL? [y/N] ``` Only confirmed links are updated. @@ -65,25 +72,25 @@ Only confirmed links are updated. This repository includes: -- `test.sh`: Generates a sample Markdown test file. -- `test.md`: A sample file with mixed link cases (200, redirect, 404/500, naked URLs, ignored image/non-http links). +- `test.sh`: Generates three sample Markdown files in `testdata/` and runs state-related checks. +- `testdata/*.markdown`: Sample files with mixed link cases (200, redirect, 404/500, naked URLs, ignored image/non-http links). -Generate or overwrite `test.md`: +Generate test data and run checks: ```bash ./test.sh ``` -Generate to a custom file path: +Generate test data only: ```bash -./test.sh /tmp/my-test.md +./test.sh --generate-only ``` -Run the tool against the generated test file: +Run the tool against generated test data: ```bash -mdlink test.md +mdlink testdata ``` ## Help diff --git a/mdlink/cli.py b/mdlink/cli.py index 9fbf958..590e185 100644 --- a/mdlink/cli.py +++ b/mdlink/cli.py @@ -3,6 +3,7 @@ from __future__ import annotations import argparse import json from collections import defaultdict +from datetime import datetime, timezone from pathlib import Path from typing import Optional @@ -29,7 +30,12 @@ def parse_args() -> argparse.Namespace: parser.add_argument( "--rescan", action="store_true", - help="Discard existing scan state and rescan all matching files.", + help="Discard scan_index state and rescan all matching files.", + ) + parser.add_argument( + "--reset-url-policy", + action="store_true", + help="Clear all stored URL ignore policies before scanning.", ) parser.add_argument( "--check", @@ -46,24 +52,87 @@ def _normalize_state_key(path: Path) -> str: return str(path) -def _load_state(path: Path) -> set[str]: +def _load_state(path: Path) -> dict: if not path.exists(): - return set() + return {"version": 1, "scan_index": {}, "url_policy": {}} try: payload = json.loads(path.read_text(encoding="utf-8")) except (OSError, json.JSONDecodeError): - return set() - files = payload.get("files") - if not isinstance(files, list): - return set() - return {item for item in files if isinstance(item, str)} + return {"version": 1, "scan_index": {}, "url_policy": {}} + + scan_index = payload.get("scan_index") + if not isinstance(scan_index, dict): + scan_index = {} + url_policy = payload.get("url_policy") + if not isinstance(url_policy, dict): + url_policy = {} + return {"version": 1, "scan_index": scan_index, "url_policy": url_policy} -def _save_state(path: Path, scanned_files: set[str]) -> None: - payload = {"files": sorted(scanned_files)} +def _save_state(path: Path, state: dict) -> None: + payload = { + "version": 1, + "scan_index": state.get("scan_index", {}), + "url_policy": state.get("url_policy", {}), + } path.write_text(json.dumps(payload, indent=2, ensure_ascii=True) + "\n", encoding="utf-8") +def _now_iso() -> str: + return datetime.now(timezone.utc).replace(microsecond=0).isoformat().replace("+00:00", "Z") + + +def _iter_ignore_entries(entries: object) -> list[dict]: + if not isinstance(entries, list): + return [] + result: list[dict] = [] + for item in entries: + if isinstance(item, dict) and item.get("action") == "ignore": + result.append(item) + return result + + +def _ignored_sources_by_file(url_policy: dict[str, object]) -> dict[str, set[str]]: + ignored: dict[str, set[str]] = {} + for file_key, entries in url_policy.items(): + if not isinstance(file_key, str): + continue + sources: set[str] = set() + for entry in _iter_ignore_entries(entries): + source = entry.get("source") + if isinstance(source, str) and source: + sources.add(source) + if sources: + ignored[file_key] = sources + return ignored + + +def _is_ignored_pair(url_policy: dict[str, object], file_key: str, source: str, target: str) -> bool: + entries = _iter_ignore_entries(url_policy.get(file_key)) + for entry in entries: + if entry.get("source") == source and entry.get("target") == target: + return True + return False + + +def _remember_ignore(url_policy: dict[str, object], file_key: str, source: str, target: str) -> None: + entries = _iter_ignore_entries(url_policy.get(file_key)) + for entry in entries: + if entry.get("source") == source and entry.get("target") == target: + entry["seen_at"] = _now_iso() + url_policy[file_key] = entries + return + entries.append( + { + "action": "ignore", + "source": source, + "target": target, + "seen_at": _now_iso(), + } + ) + url_policy[file_key] = entries + + def _is_listed_result(result: LinkCheckResult, check_codes: Optional[set[int]]) -> bool: if check_codes: return result.status_code is not None and result.status_code in check_codes @@ -175,6 +244,7 @@ def _handle_rewrites( checks: dict[str, LinkCheckResult], check_codes: Optional[set[int]], redirects: list[tuple[LinkRecord, LinkCheckResult]], + url_policy: dict[str, object], checker: LinkChecker, editor: ASTMarkdownEditor, console: Console, @@ -193,6 +263,9 @@ def _handle_rewrites( final_url = result.final_url if not final_url: continue + file_key = _normalize_state_key(record.file_path) + if _is_ignored_pair(url_policy=url_policy, file_key=file_key, source=record.url, target=final_url): + continue pair = (record.file_path, record.url, final_url) if pair in seen_pairs: continue @@ -201,6 +274,7 @@ def _handle_rewrites( console.print(f"\n[cyan]{record.file_path}:{record.line}[/cyan]") console.print(f"[yellow]{record.url}[/yellow] -> [green]{final_url}[/green]") if not _prompt_yes_no(console, "Replace old URL? [y/N] "): + _remember_ignore(url_policy=url_policy, file_key=file_key, source=record.url, target=final_url) continue verification = _cached_check(checker=checker, cache=check_cache, url=final_url) @@ -226,6 +300,9 @@ def _handle_rewrites( final_url: Optional[str] = https_check.final_url or https_url if final_url == record.url: continue + file_key = _normalize_state_key(record.file_path) + if _is_ignored_pair(url_policy=url_policy, file_key=file_key, source=record.url, target=final_url): + continue pair = (record.file_path, record.url, final_url) if pair in seen_pairs: @@ -235,6 +312,7 @@ def _handle_rewrites( console.print(f"\n[cyan]{record.file_path}:{record.line}[/cyan]") console.print(f"[yellow]{record.url}[/yellow] -> [green]{final_url}[/green]") if not _prompt_yes_no(console, "Replace old URL? [y/N] "): + _remember_ignore(url_policy=url_policy, file_key=file_key, source=record.url, target=final_url) continue verification = _cached_check(checker=checker, cache=check_cache, url=final_url) @@ -269,15 +347,36 @@ def main() -> None: console.print("No Markdown files found.") return - old_state = set() if args.rescan else _load_state(STATE_FILE) + single_file_mode = args.path.is_file() + state = {"version": 1, "scan_index": {}, "url_policy": {}} + scan_index: dict[str, object] = {} + url_policy: dict[str, object] = {} files_to_scan: list[Path] = [] skipped_count = 0 - for file_path in all_files: - state_key = _normalize_state_key(file_path) - if state_key in old_state: - skipped_count += 1 - continue - files_to_scan.append(file_path) + + if single_file_mode: + files_to_scan = all_files + else: + state = _load_state(STATE_FILE) + scan_index = state.get("scan_index", {}) + if not isinstance(scan_index, dict): + scan_index = {} + url_policy = state.get("url_policy", {}) + if not isinstance(url_policy, dict): + url_policy = {} + + if args.reset_url_policy: + url_policy = {} + + if args.rescan: + scan_index = {} + + for file_path in all_files: + state_key = _normalize_state_key(file_path) + if state_key in scan_index: + skipped_count += 1 + continue + files_to_scan.append(file_path) console.print( f"Files total: {len(all_files)} | to scan: {len(files_to_scan)} | skipped: {skipped_count}" @@ -287,18 +386,25 @@ def main() -> None: console.print("No new files to scan. Use --rescan to force a full scan.") return + ignored_sources = _ignored_sources_by_file(url_policy) if not single_file_mode else {} records: list[LinkRecord] = [] for file_path in files_to_scan: content = file_path.read_text(encoding="utf-8") - records.extend(scanner.scan_content(file_path=file_path, content=content)) - - new_state = set(old_state) - for file_path in files_to_scan: - new_state.add(_normalize_state_key(file_path)) - _save_state(STATE_FILE, new_state) + file_records = scanner.scan_content(file_path=file_path, content=content) + if not single_file_mode: + file_key = _normalize_state_key(file_path) + ignored_for_file = ignored_sources.get(file_key, set()) + file_records = [record for record in file_records if record.url not in ignored_for_file] + records.extend(file_records) if not records: console.print("No links found.") + if not single_file_mode: + for file_path in files_to_scan: + scan_index[_normalize_state_key(file_path)] = {} + state["scan_index"] = scan_index + state["url_policy"] = url_policy + _save_state(STATE_FILE, state) return urls = unique_preserve_order(record.url for record in records) @@ -334,11 +440,19 @@ def main() -> None: checks=checks, check_codes=check_codes, redirects=redirects, + url_policy=url_policy, checker=checker, editor=editor, console=console, ) + if not single_file_mode: + for file_path in files_to_scan: + scan_index[_normalize_state_key(file_path)] = {} + state["scan_index"] = scan_index + state["url_policy"] = url_policy + _save_state(STATE_FILE, state) + if __name__ == "__main__": main() diff --git a/test.md b/test.md deleted file mode 100644 index 8711be4..0000000 --- a/test.md +++ /dev/null @@ -1,22 +0,0 @@ -# mdlink Test - -- [ok](https://httpbin.org/status/200) -- [redirect](https://github.com/) -- [broken](https://httpbin.org/status/404) -- Naked: https://httpbin.org/status/500 -- ![img](https://httpbin.org/image/png) - -## Additional Cases - -- [redirect with title](https://github.com/ "GitHub redirect") -- [query and fragment](https://example.com/docs?lang=de#intro) -- [duplicate redirect](https://github.com/) -- [duplicate redirect again](https://github.com/) -- [non-http scheme should be ignored](mailto:team@example.org) -- [ftp should be ignored](ftp://speedtest.tele2.net) -- [inline code URL should not be a markdown link](`https://example.org/code`) -- [image in text should be ignored] text before ![logo](www.uph.de) text after -- [parentheses in URL](https://en.wikipedia.org/wiki/Function_(mathematics)) -- [trailing punctuation in sentence] See https://example.org/docs, for details. -- autolink angle brackets: -- bare www should be ignored: www.example.org diff --git a/test.sh b/test.sh index 7793392..19e9a1d 100755 --- a/test.sh +++ b/test.sh @@ -1,31 +1,119 @@ #!/usr/bin/env bash set -euo pipefail -OUTPUT_FILE="${1:-test.md}" +STATE_FILE=".mdlink-state.json" +TEST_DIR="testdata" +TEST_FILE="${TEST_DIR}/one.markdown" +BIN="./venv/bin/mdlink" +if [[ ! -x "${BIN}" ]]; then + BIN="./.venv/bin/mdlink" +fi -cat > "${OUTPUT_FILE}" <<'EOF' -# mdlink Test +if command -v rg >/dev/null 2>&1; then + FILTER_CMD="rg" +else + FILTER_CMD="grep -E" +fi + +generate_testdata() { + mkdir -p "${TEST_DIR}" + + cat > "${TEST_DIR}/one.markdown" <<'EOF' +# Test Data One - [ok](https://httpbin.org/status/200) - [redirect](http://github.com) - [broken](https://httpbin.org/status/404) - Naked: https://httpbin.org/status/500 - ![img](https://httpbin.org/image/png) +EOF -## Additional Cases + cat > "${TEST_DIR}/two.markdown" <<'EOF' +# Test Data Two - [redirect with title](http://github.com "GitHub redirect") -- [query and fragment](https://example.com/docs?lang=de#intro) - [duplicate redirect](http://github.com) -- [duplicate redirect again](http://github.com) -- [non-http scheme should be ignored](mailto:team@example.org) - [ftp should be ignored](ftp://speedtest.tele2.net) +- [query and fragment](https://example.com/docs?lang=de#intro) - [inline code URL should not be a markdown link](`https://example.org/code`) -- [image in text should be ignored] text before ![logo](https://example.com/logo.png) text after +- autolink angle brackets: +EOF + + cat > "${TEST_DIR}/three.markdown" <<'EOF' +# Test Data Three + - [parentheses in URL](https://en.wikipedia.org/wiki/Function_(mathematics)) - [trailing punctuation in sentence] See https://example.org/docs, for details. -- autolink angle brackets: +- [image in text should be ignored] text before ![logo](https://example.com/logo.png) text after +- [mailto should be ignored](mailto:team@example.org) - bare www should be ignored: www.example.org EOF -echo "Generated ${OUTPUT_FILE}" + echo "Generated ${TEST_DIR}/one.markdown" + echo "Generated ${TEST_DIR}/two.markdown" + echo "Generated ${TEST_DIR}/three.markdown" +} + +run_state_checks() { + echo "Using binary: ${BIN}" + echo "Using filter: ${FILTER_CMD}" + echo "Using data dir: ${TEST_DIR}" + echo + + echo "1) First directory scan" + rm -f "${STATE_FILE}" + ${BIN} "${TEST_DIR}" --timeout 0.5 --check 404 | ${FILTER_CMD} "^Files total|^Checking " || true + echo + + echo "2) Second directory scan (should skip files via scan_index)" + ${BIN} "${TEST_DIR}" --timeout 0.5 --check 404 | ${FILTER_CMD} "^Files total|No new files" || true + echo + + echo "3) Single-file scan ignores state (should still scan file)" + ${BIN} "${TEST_FILE}" --timeout 0.5 --check 404 | ${FILTER_CMD} "^Files total|^Checking " || true + echo + + echo "4) url_policy ignore reduces checked URLs" + cat > "${STATE_FILE}" <<'EOF' +{ + "version": 1, + "scan_index": {}, + "url_policy": { + "testdata/one.markdown": [ + { + "action": "ignore", + "source": "https://httpbin.org/status/404", + "target": "https://httpbin.org/status/404", + "seen_at": "2026-04-17T12:00:00Z" + } + ], + "testdata/two.markdown": [ + { + "action": "ignore", + "source": "http://github.com", + "target": "https://github.com/", + "seen_at": "2026-04-17T12:00:01Z" + } + ] + } +} +EOF + ${BIN} "${TEST_DIR}" --rescan --timeout 0.5 --check 404 | ${FILTER_CMD} "^Files total|^Checking " || true + echo + + echo "5) --reset-url-policy brings ignored URL back into checks" + ${BIN} "${TEST_DIR}" --rescan --reset-url-policy --timeout 0.5 --check 404 | ${FILTER_CMD} "^Files total|^Checking " || true + echo + + echo "Done." +} + +generate_only=false +if [[ "${1:-}" == "--generate-only" ]]; then + generate_only=true +fi + +generate_testdata +if [[ "${generate_only}" == false ]]; then + run_state_checks +fi diff --git a/testdata/one.markdown b/testdata/one.markdown new file mode 100644 index 0000000..0b11029 --- /dev/null +++ b/testdata/one.markdown @@ -0,0 +1,7 @@ +# Test Data One + +- [ok](https://httpbin.org/status/200) +- [redirect](http://github.com) +- [broken](https://httpbin.org/status/404) +- Naked: https://httpbin.org/status/500 +- ![img](https://httpbin.org/image/png) diff --git a/testdata/three.markdown b/testdata/three.markdown new file mode 100644 index 0000000..e7a0736 --- /dev/null +++ b/testdata/three.markdown @@ -0,0 +1,7 @@ +# Test Data Three + +- [parentheses in URL](https://en.wikipedia.org/wiki/Function_(mathematics)) +- [trailing punctuation in sentence] See https://example.org/docs, for details. +- [image in text should be ignored] text before ![logo](https://example.com/logo.png) text after +- [mailto should be ignored](mailto:team@example.org) +- bare www should be ignored: www.example.org diff --git a/testdata/two.markdown b/testdata/two.markdown new file mode 100644 index 0000000..a61086e --- /dev/null +++ b/testdata/two.markdown @@ -0,0 +1,8 @@ +# Test Data Two + +- [redirect with title](http://github.com "GitHub redirect") +- [duplicate redirect](http://github.com) +- [ftp should be ignored](ftp://speedtest.tele2.net) +- [query and fragment](https://example.com/docs?lang=de#intro) +- [inline code URL should not be a markdown link](`https://example.org/code`) +- autolink angle brackets: