From 86d7dc64d41255ff33af9924f914d44604ab7346 Mon Sep 17 00:00:00 2001
From: drg <gammlaa@chaospott.de>
Date: Fri, 17 Apr 2026 21:30:26 +0200
Subject: [PATCH] feat: add persistent scan_index/url_policy state with
 --rescan and --reset-url-policy

---
 README.md               |  29 +++++---
 mdlink/cli.py           | 160 ++++++++++++++++++++++++++++++++++------
 test.md                 |  22 ------
 test.sh                 | 108 ++++++++++++++++++++++++---
 testdata/one.markdown   |   7 ++
 testdata/three.markdown |   7 ++
 testdata/two.markdown   |   8 ++
 7 files changed, 275 insertions(+), 66 deletions(-)
 delete mode 100644 test.md
 create mode 100644 testdata/one.markdown
 create mode 100644 testdata/three.markdown
 create mode 100644 testdata/two.markdown

diff --git a/README.md b/README.md
index 055e948..3ef6105 100644
--- a/README.md
+++ b/README.md
@@ -39,24 +39,31 @@ mdlink .
 - `--timeout FLOAT`  
   Per-request timeout in seconds (default: `10.0`).
 - `--rescan`  
-  Discard `.mdlink-state.json` and run a full scan on all matching files.
+  Reset only `scan_index` in `.mdlink-state.json` and run a full scan on all matching files.
+- `--reset-url-policy`  
+  Clear all stored URL ignore policies before scanning (directory scans only; single-file scans do not use state).
 - `--check CODE`  
   Report only selected HTTP status codes. Repeat option for multiple codes (for example `--check 404` or `--check 301 --check 404`).
 
-By default, `mdlink` stores scanned file paths in `.mdlink-state.json` and skips those files in later runs.
+By default, `mdlink` stores:
+
+- `scan_index`: already scanned files for incremental directory scans.
+- `url_policy`: ignored URL decisions per file.
+
+Single-file scans do not use state.
 
 ## Interactive Redirect Rewrite
 
 Step 1: When a Markdown link redirects, `mdlink` prompts:
 
 ```text
-Replace old URL with final URL? [y/N]
+Replace old URL? [y/N]
 ```
 
 Step 2: For `http://` Markdown links without redirect, `mdlink` can test an `https://` variant and prompt:
 
 ```text
-Replace HTTP URL with HTTPS variant? [y/N]
+Replace old URL? [y/N]
 ```
 
 Only confirmed links are updated.
@@ -65,25 +72,25 @@ Only confirmed links are updated.
 
 This repository includes:
 
-- `test.sh`: Generates a sample Markdown test file.
-- `test.md`: A sample file with mixed link cases (200, redirect, 404/500, naked URLs, ignored image/non-http links).
+- `test.sh`: Generates three sample Markdown files in `testdata/` and runs state-related checks.
+- `testdata/*.markdown`: Sample files with mixed link cases (200, redirect, 404/500, naked URLs, ignored image/non-http links).
 
-Generate or overwrite `test.md`:
+Generate test data and run checks:
 
 ```bash
 ./test.sh
 ```
 
-Generate to a custom file path:
+Generate test data only:
 
 ```bash
-./test.sh /tmp/my-test.md
+./test.sh --generate-only
 ```
 
-Run the tool against the generated test file:
+Run the tool against generated test data:
 
 ```bash
-mdlink test.md
+mdlink testdata
 ```
 
 ## Help
diff --git a/mdlink/cli.py b/mdlink/cli.py
index 9fbf958..590e185 100644
--- a/mdlink/cli.py
+++ b/mdlink/cli.py
@@ -3,6 +3,7 @@ from __future__ import annotations
 import argparse
 import json
 from collections import defaultdict
+from datetime import datetime, timezone
 from pathlib import Path
 from typing import Optional
 
@@ -29,7 +30,12 @@ def parse_args() -> argparse.Namespace:
     parser.add_argument(
         "--rescan",
         action="store_true",
-        help="Discard existing scan state and rescan all matching files.",
+        help="Discard scan_index state and rescan all matching files.",
+    )
+    parser.add_argument(
+        "--reset-url-policy",
+        action="store_true",
+        help="Clear all stored URL ignore policies before scanning.",
     )
     parser.add_argument(
         "--check",
@@ -46,24 +52,87 @@ def _normalize_state_key(path: Path) -> str:
     return str(path)
 
 
-def _load_state(path: Path) -> set[str]:
+def _load_state(path: Path) -> dict:
+    """Load state from ``path``; a missing or unusable file yields empty state."""
     if not path.exists():
-        return set()
+        return {"version": 1, "scan_index": {}, "url_policy": {}}
     try:
         payload = json.loads(path.read_text(encoding="utf-8"))
     except (OSError, json.JSONDecodeError):
-        return set()
-    files = payload.get("files")
-    if not isinstance(files, list):
-        return set()
-    return {item for item in files if isinstance(item, str)}
+        return {"version": 1, "scan_index": {}, "url_policy": {}}
+
+    # Valid JSON need not be an object (e.g. a list), and then has no .get().
+    sections = payload if isinstance(payload, dict) else {}
+    state: dict = {"version": 1}
+    for name in ("scan_index", "url_policy"):
+        state[name] = sections[name] if isinstance(sections.get(name), dict) else {}
+    return state
 
 
-def _save_state(path: Path, scanned_files: set[str]) -> None:
-    payload = {"files": sorted(scanned_files)}
+def _save_state(path: Path, state: dict) -> None:
+    """Write the ``scan_index`` and ``url_policy`` sections of ``state`` to ``path`` as JSON."""
+    payload: dict = {"version": 1}
+    # Only the known sections are persisted; a missing section is written as {}.
+    for section in ("scan_index", "url_policy"):
+        payload[section] = state.get(section, {})
     path.write_text(json.dumps(payload, indent=2, ensure_ascii=True) + "\n", encoding="utf-8")
 
 
+def _now_iso() -> str:  # Current UTC time, whole seconds, ISO 8601 with "Z" instead of "+00:00".
+    return datetime.now(timezone.utc).replace(microsecond=0).isoformat().replace("+00:00", "Z")
+
+
+def _iter_ignore_entries(entries: object) -> list[dict]:
+    """Return the dict items of ``entries`` whose ``"action"`` is ``"ignore"``.
+
+    Anything that is not a list yields an empty list.
+    """
+    if not isinstance(entries, list):
+        return []
+    return [item for item in entries if isinstance(item, dict) and item.get("action") == "ignore"]
+
+
+def _ignored_sources_by_file(url_policy: dict[str, object]) -> dict[str, set[str]]:
+    """Map each file key to the set of source URLs it has ignore entries for.
+
+    Non-string keys, non-string or empty sources, and files left without any
+    usable source are omitted from the result.
+    """
+    by_file: dict[str, set[str]] = {}
+    for key, raw_entries in url_policy.items():
+        candidates = (item.get("source") for item in _iter_ignore_entries(raw_entries))
+        kept = {url for url in candidates if isinstance(url, str) and url}
+        if kept and isinstance(key, str):
+            by_file[key] = kept
+    return by_file
+
+
+def _is_ignored_pair(url_policy: dict[str, object], file_key: str, source: str, target: str) -> bool:
+    """Return True if ``file_key`` has an ignore entry for exactly ``source`` -> ``target``."""
+    return any(
+        item.get("source") == source and item.get("target") == target
+        for item in _iter_ignore_entries(url_policy.get(file_key))
+    )
+
+
+def _remember_ignore(url_policy: dict[str, object], file_key: str, source: str, target: str) -> None:
+    """Store an ignore decision for ``source`` -> ``target`` under ``file_key``.
+
+    An existing matching entry only gets its ``seen_at`` refreshed; otherwise a
+    new entry is appended. The list stored under ``file_key`` is replaced by its
+    filtered ignore entries, so anything else that was stored there is dropped.
+    """
+    kept = _iter_ignore_entries(url_policy.get(file_key))
+    url_policy[file_key] = kept
+    stamp = _now_iso()
+    # Only the first matching entry is refreshed.
+    match = next((e for e in kept if e.get("source") == source and e.get("target") == target), None)
+    if match is None:
+        kept.append({"action": "ignore", "source": source, "target": target, "seen_at": stamp})
+    else:
+        match["seen_at"] = stamp
+
+
 def _is_listed_result(result: LinkCheckResult, check_codes: Optional[set[int]]) -> bool:
     if check_codes:
         return result.status_code is not None and result.status_code in check_codes
@@ -175,6 +244,7 @@ def _handle_rewrites(
     checks: dict[str, LinkCheckResult],
     check_codes: Optional[set[int]],
     redirects: list[tuple[LinkRecord, LinkCheckResult]],
+    url_policy: dict[str, object],
     checker: LinkChecker,
     editor: ASTMarkdownEditor,
     console: Console,
@@ -193,6 +263,9 @@ def _handle_rewrites(
             final_url = result.final_url
             if not final_url:
                 continue
+            file_key = _normalize_state_key(record.file_path)
+            if _is_ignored_pair(url_policy=url_policy, file_key=file_key, source=record.url, target=final_url):
+                continue
             pair = (record.file_path, record.url, final_url)
             if pair in seen_pairs:
                 continue
@@ -201,6 +274,7 @@ def _handle_rewrites(
             console.print(f"\n[cyan]{record.file_path}:{record.line}[/cyan]")
             console.print(f"[yellow]{record.url}[/yellow] -> [green]{final_url}[/green]")
             if not _prompt_yes_no(console, "Replace old URL? [y/N] "):
+                _remember_ignore(url_policy=url_policy, file_key=file_key, source=record.url, target=final_url)
                 continue
 
             verification = _cached_check(checker=checker, cache=check_cache, url=final_url)
@@ -226,6 +300,9 @@ def _handle_rewrites(
             final_url: Optional[str] = https_check.final_url or https_url
             if final_url == record.url:
                 continue
+            file_key = _normalize_state_key(record.file_path)
+            if _is_ignored_pair(url_policy=url_policy, file_key=file_key, source=record.url, target=final_url):
+                continue
 
             pair = (record.file_path, record.url, final_url)
             if pair in seen_pairs:
@@ -235,6 +312,7 @@ def _handle_rewrites(
             console.print(f"\n[cyan]{record.file_path}:{record.line}[/cyan]")
             console.print(f"[yellow]{record.url}[/yellow] -> [green]{final_url}[/green]")
             if not _prompt_yes_no(console, "Replace old URL? [y/N] "):
+                _remember_ignore(url_policy=url_policy, file_key=file_key, source=record.url, target=final_url)
                 continue
 
             verification = _cached_check(checker=checker, cache=check_cache, url=final_url)
@@ -269,15 +347,36 @@ def main() -> None:
         console.print("No Markdown files found.")
         return
 
-    old_state = set() if args.rescan else _load_state(STATE_FILE)
+    single_file_mode = args.path.is_file()
+    state = {"version": 1, "scan_index": {}, "url_policy": {}}
+    scan_index: dict[str, object] = {}
+    url_policy: dict[str, object] = {}
     files_to_scan: list[Path] = []
     skipped_count = 0
-    for file_path in all_files:
-        state_key = _normalize_state_key(file_path)
-        if state_key in old_state:
-            skipped_count += 1
-            continue
-        files_to_scan.append(file_path)
+
+    if single_file_mode:
+        files_to_scan = all_files
+    else:
+        state = _load_state(STATE_FILE)
+        scan_index = state.get("scan_index", {})
+        if not isinstance(scan_index, dict):
+            scan_index = {}
+        url_policy = state.get("url_policy", {})
+        if not isinstance(url_policy, dict):
+            url_policy = {}
+
+        if args.reset_url_policy:
+            url_policy = {}
+
+        if args.rescan:
+            scan_index = {}
+
+        for file_path in all_files:
+            state_key = _normalize_state_key(file_path)
+            if state_key in scan_index:
+                skipped_count += 1
+                continue
+            files_to_scan.append(file_path)
 
     console.print(
         f"Files total: {len(all_files)} | to scan: {len(files_to_scan)} | skipped: {skipped_count}"
@@ -287,18 +386,25 @@ def main() -> None:
         console.print("No new files to scan. Use --rescan to force a full scan.")
         return
 
+    ignored_sources = _ignored_sources_by_file(url_policy) if not single_file_mode else {}
     records: list[LinkRecord] = []
     for file_path in files_to_scan:
         content = file_path.read_text(encoding="utf-8")
-        records.extend(scanner.scan_content(file_path=file_path, content=content))
-
-    new_state = set(old_state)
-    for file_path in files_to_scan:
-        new_state.add(_normalize_state_key(file_path))
-    _save_state(STATE_FILE, new_state)
+        file_records = scanner.scan_content(file_path=file_path, content=content)
+        if not single_file_mode:
+            file_key = _normalize_state_key(file_path)
+            ignored_for_file = ignored_sources.get(file_key, set())
+            file_records = [record for record in file_records if record.url not in ignored_for_file]
+        records.extend(file_records)
 
     if not records:
         console.print("No links found.")
+        if not single_file_mode:
+            for file_path in files_to_scan:
+                scan_index[_normalize_state_key(file_path)] = {}
+            state["scan_index"] = scan_index
+            state["url_policy"] = url_policy
+            _save_state(STATE_FILE, state)
         return
 
     urls = unique_preserve_order(record.url for record in records)
@@ -334,11 +440,19 @@ def main() -> None:
             checks=checks,
             check_codes=check_codes,
             redirects=redirects,
+            url_policy=url_policy,
             checker=checker,
             editor=editor,
             console=console,
         )
 
+    if not single_file_mode:
+        for file_path in files_to_scan:
+            scan_index[_normalize_state_key(file_path)] = {}
+        state["scan_index"] = scan_index
+        state["url_policy"] = url_policy
+        _save_state(STATE_FILE, state)
+
 
 if __name__ == "__main__":
     main()
diff --git a/test.md b/test.md
deleted file mode 100644
index 8711be4..0000000
--- a/test.md
+++ /dev/null
@@ -1,22 +0,0 @@
-# mdlink Test
-
-- [ok](https://httpbin.org/status/200)
-- [redirect](https://github.com/)
-- [broken](https://httpbin.org/status/404)
-- Naked: https://httpbin.org/status/500
-- ![img](https://httpbin.org/image/png)
-
-## Additional Cases
-
-- [redirect with title](https://github.com/ "GitHub redirect")
-- [query and fragment](https://example.com/docs?lang=de#intro)
-- [duplicate redirect](https://github.com/)
-- [duplicate redirect again](https://github.com/)
-- [non-http scheme should be ignored](mailto:team@example.org)
-- [ftp should be ignored](ftp://speedtest.tele2.net)
-- [inline code URL should not be a markdown link](`https://example.org/code`)
-- [image in text should be ignored] text before ![logo](www.uph.de) text after
-- [parentheses in URL](https://en.wikipedia.org/wiki/Function_(mathematics))
-- [trailing punctuation in sentence] See https://example.org/docs, for details.
-- autolink angle brackets: <https://example.org/autolink>
-- bare www should be ignored: www.example.org
diff --git a/test.sh b/test.sh
index 7793392..19e9a1d 100755
--- a/test.sh
+++ b/test.sh
@@ -1,31 +1,119 @@
 #!/usr/bin/env bash
 set -euo pipefail
 
-OUTPUT_FILE="${1:-test.md}"
+STATE_FILE=".mdlink-state.json"
+TEST_DIR="testdata"
+TEST_FILE="${TEST_DIR}/one.markdown"
+BIN="./venv/bin/mdlink"
+if [[ ! -x "${BIN}" ]]; then
+  BIN="./.venv/bin/mdlink"
+fi
 
-cat > "${OUTPUT_FILE}" <<'EOF'
-# mdlink Test
+if command -v rg >/dev/null 2>&1; then
+  FILTER_CMD="rg"
+else
+  FILTER_CMD="grep -E"
+fi
+
+generate_testdata() {
+  mkdir -p "${TEST_DIR}"
+
+  cat > "${TEST_DIR}/one.markdown" <<'EOF'
+# Test Data One
 
 - [ok](https://httpbin.org/status/200)
 - [redirect](http://github.com)
 - [broken](https://httpbin.org/status/404)
 - Naked: https://httpbin.org/status/500
 - ![img](https://httpbin.org/image/png)
+EOF
 
-## Additional Cases
+  cat > "${TEST_DIR}/two.markdown" <<'EOF'
+# Test Data Two
 
 - [redirect with title](http://github.com "GitHub redirect")
-- [query and fragment](https://example.com/docs?lang=de#intro)
 - [duplicate redirect](http://github.com)
-- [duplicate redirect again](http://github.com)
-- [non-http scheme should be ignored](mailto:team@example.org)
 - [ftp should be ignored](ftp://speedtest.tele2.net)
+- [query and fragment](https://example.com/docs?lang=de#intro)
 - [inline code URL should not be a markdown link](`https://example.org/code`)
-- [image in text should be ignored] text before ![logo](https://example.com/logo.png) text after
+- autolink angle brackets: <https://example.org/autolink>
+EOF
+
+  cat > "${TEST_DIR}/three.markdown" <<'EOF'
+# Test Data Three
+
 - [parentheses in URL](https://en.wikipedia.org/wiki/Function_(mathematics))
 - [trailing punctuation in sentence] See https://example.org/docs, for details.
-- autolink angle brackets: <https://example.org/autolink>
+- [image in text should be ignored] text before ![logo](https://example.com/logo.png) text after
+- [mailto should be ignored](mailto:team@example.org)
 - bare www should be ignored: www.example.org
 EOF
 
-echo "Generated ${OUTPUT_FILE}"
+  echo "Generated ${TEST_DIR}/one.markdown"
+  echo "Generated ${TEST_DIR}/two.markdown"
+  echo "Generated ${TEST_DIR}/three.markdown"
+}
+
+run_state_checks() {
+  # Fail early: every step below ends in `|| true`, which would otherwise hide a missing binary.
+  if [[ ! -x "${BIN}" ]]; then
+    echo "mdlink binary not found or not executable: ${BIN}" >&2; return 1
+  fi
+  printf 'Using binary: %s\nUsing filter: %s\nUsing data dir: %s\n\n' "${BIN}" "${FILTER_CMD}" "${TEST_DIR}"
+  echo "1) First directory scan"
+  rm -f "${STATE_FILE}"
+  ${BIN} "${TEST_DIR}" --timeout 0.5 --check 404 | ${FILTER_CMD} "^Files total|^Checking " || true
+  echo
+
+  echo "2) Second directory scan (should skip files via scan_index)"
+  ${BIN} "${TEST_DIR}" --timeout 0.5 --check 404 | ${FILTER_CMD} "^Files total|No new files" || true
+  echo
+
+  echo "3) Single-file scan ignores state (should still scan file)"
+  ${BIN} "${TEST_FILE}" --timeout 0.5 --check 404 | ${FILTER_CMD} "^Files total|^Checking " || true
+  echo
+
+  echo "4) url_policy ignore reduces checked URLs"
+  cat > "${STATE_FILE}" <<'EOF'
+{
+  "version": 1,
+  "scan_index": {},
+  "url_policy": {
+    "testdata/one.markdown": [
+      {
+        "action": "ignore",
+        "source": "https://httpbin.org/status/404",
+        "target": "https://httpbin.org/status/404",
+        "seen_at": "2026-04-17T12:00:00Z"
+      }
+    ],
+    "testdata/two.markdown": [
+      {
+        "action": "ignore",
+        "source": "http://github.com",
+        "target": "https://github.com/",
+        "seen_at": "2026-04-17T12:00:01Z"
+      }
+    ]
+  }
+}
+EOF
+  ${BIN} "${TEST_DIR}" --rescan --timeout 0.5 --check 404 | ${FILTER_CMD} "^Files total|^Checking " || true
+  echo
+
+  echo "5) --reset-url-policy brings ignored URL back into checks"
+  ${BIN} "${TEST_DIR}" --rescan --reset-url-policy --timeout 0.5 --check 404 | ${FILTER_CMD} "^Files total|^Checking " || true
+  echo
+
+  echo "Done."
+}
+
+generate_only=false
+if [[ "${1:-}" == "--generate-only" ]]; then
+  generate_only=true
+fi
+
+generate_testdata
+if [[ "${generate_only}" == false ]]; then
+  run_state_checks
+fi
diff --git a/testdata/one.markdown b/testdata/one.markdown
new file mode 100644
index 0000000..0b11029
--- /dev/null
+++ b/testdata/one.markdown
@@ -0,0 +1,7 @@
+# Test Data One
+
+- [ok](https://httpbin.org/status/200)
+- [redirect](http://github.com)
+- [broken](https://httpbin.org/status/404)
+- Naked: https://httpbin.org/status/500
+- ![img](https://httpbin.org/image/png)
diff --git a/testdata/three.markdown b/testdata/three.markdown
new file mode 100644
index 0000000..e7a0736
--- /dev/null
+++ b/testdata/three.markdown
@@ -0,0 +1,7 @@
+# Test Data Three
+
+- [parentheses in URL](https://en.wikipedia.org/wiki/Function_(mathematics))
+- [trailing punctuation in sentence] See https://example.org/docs, for details.
+- [image in text should be ignored] text before ![logo](https://example.com/logo.png) text after
+- [mailto should be ignored](mailto:team@example.org)
+- bare www should be ignored: www.example.org
diff --git a/testdata/two.markdown b/testdata/two.markdown
new file mode 100644
index 0000000..a61086e
--- /dev/null
+++ b/testdata/two.markdown
@@ -0,0 +1,8 @@
+# Test Data Two
+
+- [redirect with title](http://github.com "GitHub redirect")
+- [duplicate redirect](http://github.com)
+- [ftp should be ignored](ftp://speedtest.tele2.net)
+- [query and fragment](https://example.com/docs?lang=de#intro)
+- [inline code URL should not be a markdown link](`https://example.org/code`)
+- autolink angle brackets: <https://example.org/autolink>