feat: add persistent scan_index/url_policy state with --rescan and --reset-url-policy
This commit is contained in:
@@ -39,24 +39,31 @@ mdlink .
|
|||||||
- `--timeout FLOAT`
|
- `--timeout FLOAT`
|
||||||
Per-request timeout in seconds (default: `10.0`).
|
Per-request timeout in seconds (default: `10.0`).
|
||||||
- `--rescan`
|
- `--rescan`
|
||||||
Discard `.mdlink-state.json` and run a full scan on all matching files.
|
Reset only `scan_index` in `.mdlink-state.json` and run a full scan on all matching files.
|
||||||
|
- `--reset-url-policy`
|
||||||
|
Clear all stored URL ignore policies before scanning.
|
||||||
- `--check CODE`
|
- `--check CODE`
|
||||||
Report only selected HTTP status codes. Repeat option for multiple codes (for example `--check 404` or `--check 301 --check 404`).
|
Report only selected HTTP status codes. Repeat option for multiple codes (for example `--check 404` or `--check 301 --check 404`).
|
||||||
|
|
||||||
By default, `mdlink` stores scanned file paths in `.mdlink-state.json` and skips those files in later runs.
|
By default, `mdlink` stores the following in `.mdlink-state.json`:
|
||||||
|
|
||||||
|
- `scan_index`: already scanned files for incremental directory scans.
|
||||||
|
- `url_policy`: ignored URL decisions per file.
|
||||||
|
|
||||||
|
Single-file scans do not use state.
|
||||||
|
|
||||||
## Interactive Redirect Rewrite
|
## Interactive Redirect Rewrite
|
||||||
|
|
||||||
Step 1: When a Markdown link redirects, `mdlink` prompts:
|
Step 1: When a Markdown link redirects, `mdlink` prompts:
|
||||||
|
|
||||||
```text
|
```text
|
||||||
Replace old URL with final URL? [y/N]
|
Replace old URL? [y/N]
|
||||||
```
|
```
|
||||||
|
|
||||||
Step 2: For `http://` Markdown links without redirect, `mdlink` can test an `https://` variant and prompt:
|
Step 2: For `http://` Markdown links without redirect, `mdlink` can test an `https://` variant and prompt:
|
||||||
|
|
||||||
```text
|
```text
|
||||||
Replace HTTP URL with HTTPS variant? [y/N]
|
Replace old URL? [y/N]
|
||||||
```
|
```
|
||||||
|
|
||||||
Only confirmed links are updated.
|
Only confirmed links are updated.
|
||||||
@@ -65,25 +72,25 @@ Only confirmed links are updated.
|
|||||||
|
|
||||||
This repository includes:
|
This repository includes:
|
||||||
|
|
||||||
- `test.sh`: Generates a sample Markdown test file.
|
- `test.sh`: Generates three sample Markdown files in `testdata/` and runs state-related checks.
|
||||||
- `test.md`: A sample file with mixed link cases (200, redirect, 404/500, naked URLs, ignored image/non-http links).
|
- `testdata/*.markdown`: Sample files with mixed link cases (200, redirect, 404/500, naked URLs, ignored image/non-http links).
|
||||||
|
|
||||||
Generate or overwrite `test.md`:
|
Generate test data and run checks:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
./test.sh
|
./test.sh
|
||||||
```
|
```
|
||||||
|
|
||||||
Generate to a custom file path:
|
Generate test data only:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
./test.sh /tmp/my-test.md
|
./test.sh --generate-only
|
||||||
```
|
```
|
||||||
|
|
||||||
Run the tool against the generated test file:
|
Run the tool against generated test data:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
mdlink test.md
|
mdlink testdata
|
||||||
```
|
```
|
||||||
|
|
||||||
## Help
|
## Help
|
||||||
|
|||||||
+137
-23
@@ -3,6 +3,7 @@ from __future__ import annotations
|
|||||||
import argparse
|
import argparse
|
||||||
import json
|
import json
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
|
from datetime import datetime, timezone
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
@@ -29,7 +30,12 @@ def parse_args() -> argparse.Namespace:
|
|||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--rescan",
|
"--rescan",
|
||||||
action="store_true",
|
action="store_true",
|
||||||
help="Discard existing scan state and rescan all matching files.",
|
help="Discard scan_index state and rescan all matching files.",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--reset-url-policy",
|
||||||
|
action="store_true",
|
||||||
|
help="Clear all stored URL ignore policies before scanning.",
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--check",
|
"--check",
|
||||||
@@ -46,24 +52,87 @@ def _normalize_state_key(path: Path) -> str:
|
|||||||
return str(path)
|
return str(path)
|
||||||
|
|
||||||
|
|
||||||
def _load_state(path: Path) -> set[str]:
|
def _load_state(path: Path) -> dict:
|
||||||
if not path.exists():
|
if not path.exists():
|
||||||
return set()
|
return {"version": 1, "scan_index": {}, "url_policy": {}}
|
||||||
try:
|
try:
|
||||||
payload = json.loads(path.read_text(encoding="utf-8"))
|
payload = json.loads(path.read_text(encoding="utf-8"))
|
||||||
except (OSError, json.JSONDecodeError):
|
except (OSError, json.JSONDecodeError):
|
||||||
return set()
|
return {"version": 1, "scan_index": {}, "url_policy": {}}
|
||||||
files = payload.get("files")
|
|
||||||
if not isinstance(files, list):
|
scan_index = payload.get("scan_index")
|
||||||
return set()
|
if not isinstance(scan_index, dict):
|
||||||
return {item for item in files if isinstance(item, str)}
|
scan_index = {}
|
||||||
|
url_policy = payload.get("url_policy")
|
||||||
|
if not isinstance(url_policy, dict):
|
||||||
|
url_policy = {}
|
||||||
|
return {"version": 1, "scan_index": scan_index, "url_policy": url_policy}
|
||||||
|
|
||||||
|
|
||||||
def _save_state(path: Path, scanned_files: set[str]) -> None:
|
def _save_state(path: Path, state: dict) -> None:
|
||||||
payload = {"files": sorted(scanned_files)}
|
payload = {
|
||||||
|
"version": 1,
|
||||||
|
"scan_index": state.get("scan_index", {}),
|
||||||
|
"url_policy": state.get("url_policy", {}),
|
||||||
|
}
|
||||||
path.write_text(json.dumps(payload, indent=2, ensure_ascii=True) + "\n", encoding="utf-8")
|
path.write_text(json.dumps(payload, indent=2, ensure_ascii=True) + "\n", encoding="utf-8")
|
||||||
|
|
||||||
|
|
||||||
|
def _now_iso() -> str:
|
||||||
|
return datetime.now(timezone.utc).replace(microsecond=0).isoformat().replace("+00:00", "Z")
|
||||||
|
|
||||||
|
|
||||||
|
def _iter_ignore_entries(entries: object) -> list[dict]:
|
||||||
|
if not isinstance(entries, list):
|
||||||
|
return []
|
||||||
|
result: list[dict] = []
|
||||||
|
for item in entries:
|
||||||
|
if isinstance(item, dict) and item.get("action") == "ignore":
|
||||||
|
result.append(item)
|
||||||
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
def _ignored_sources_by_file(url_policy: dict[str, object]) -> dict[str, set[str]]:
    """Collect the ignored source URLs per file key from *url_policy*.

    Non-string file keys are skipped, as are entries whose ``"source"`` is
    missing, empty or not a string. Files that end up with no ignored
    sources are left out of the result.
    """
    by_file: dict[str, set[str]] = {}
    for key, raw_entries in url_policy.items():
        if not isinstance(key, str):
            continue
        candidates = (rule.get("source") for rule in _iter_ignore_entries(raw_entries))
        urls: set[str] = {url for url in candidates if isinstance(url, str) and url}
        if urls:
            by_file[key] = urls
    return by_file
|
||||||
|
|
||||||
|
|
||||||
|
def _is_ignored_pair(url_policy: dict[str, object], file_key: str, source: str, target: str) -> bool:
    """Tell whether *file_key* has an ignore entry for exactly *source* -> *target*."""
    return any(
        rule.get("source") == source and rule.get("target") == target
        for rule in _iter_ignore_entries(url_policy.get(file_key))
    )
|
||||||
|
|
||||||
|
|
||||||
|
def _remember_ignore(url_policy: dict[str, object], file_key: str, source: str, target: str) -> None:
    """Record an ignore decision for *source* -> *target* under *file_key*.

    If the pair already has an ignore entry, only its ``"seen_at"`` timestamp
    is refreshed; otherwise a new entry is appended. In both cases
    ``url_policy[file_key]`` is replaced by the filtered list of ignore
    entries, so *url_policy* is mutated in place.
    """
    rules = _iter_ignore_entries(url_policy.get(file_key))
    existing = next(
        (
            rule
            for rule in rules
            if rule.get("source") == source and rule.get("target") == target
        ),
        None,
    )
    if existing is None:
        rules.append(
            {
                "action": "ignore",
                "source": source,
                "target": target,
                "seen_at": _now_iso(),
            }
        )
    else:
        existing["seen_at"] = _now_iso()
    url_policy[file_key] = rules
|
||||||
|
|
||||||
|
|
||||||
def _is_listed_result(result: LinkCheckResult, check_codes: Optional[set[int]]) -> bool:
|
def _is_listed_result(result: LinkCheckResult, check_codes: Optional[set[int]]) -> bool:
|
||||||
if check_codes:
|
if check_codes:
|
||||||
return result.status_code is not None and result.status_code in check_codes
|
return result.status_code is not None and result.status_code in check_codes
|
||||||
@@ -175,6 +244,7 @@ def _handle_rewrites(
|
|||||||
checks: dict[str, LinkCheckResult],
|
checks: dict[str, LinkCheckResult],
|
||||||
check_codes: Optional[set[int]],
|
check_codes: Optional[set[int]],
|
||||||
redirects: list[tuple[LinkRecord, LinkCheckResult]],
|
redirects: list[tuple[LinkRecord, LinkCheckResult]],
|
||||||
|
url_policy: dict[str, object],
|
||||||
checker: LinkChecker,
|
checker: LinkChecker,
|
||||||
editor: ASTMarkdownEditor,
|
editor: ASTMarkdownEditor,
|
||||||
console: Console,
|
console: Console,
|
||||||
@@ -193,6 +263,9 @@ def _handle_rewrites(
|
|||||||
final_url = result.final_url
|
final_url = result.final_url
|
||||||
if not final_url:
|
if not final_url:
|
||||||
continue
|
continue
|
||||||
|
file_key = _normalize_state_key(record.file_path)
|
||||||
|
if _is_ignored_pair(url_policy=url_policy, file_key=file_key, source=record.url, target=final_url):
|
||||||
|
continue
|
||||||
pair = (record.file_path, record.url, final_url)
|
pair = (record.file_path, record.url, final_url)
|
||||||
if pair in seen_pairs:
|
if pair in seen_pairs:
|
||||||
continue
|
continue
|
||||||
@@ -201,6 +274,7 @@ def _handle_rewrites(
|
|||||||
console.print(f"\n[cyan]{record.file_path}:{record.line}[/cyan]")
|
console.print(f"\n[cyan]{record.file_path}:{record.line}[/cyan]")
|
||||||
console.print(f"[yellow]{record.url}[/yellow] -> [green]{final_url}[/green]")
|
console.print(f"[yellow]{record.url}[/yellow] -> [green]{final_url}[/green]")
|
||||||
if not _prompt_yes_no(console, "Replace old URL? [y/N] "):
|
if not _prompt_yes_no(console, "Replace old URL? [y/N] "):
|
||||||
|
_remember_ignore(url_policy=url_policy, file_key=file_key, source=record.url, target=final_url)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
verification = _cached_check(checker=checker, cache=check_cache, url=final_url)
|
verification = _cached_check(checker=checker, cache=check_cache, url=final_url)
|
||||||
@@ -226,6 +300,9 @@ def _handle_rewrites(
|
|||||||
final_url: Optional[str] = https_check.final_url or https_url
|
final_url: Optional[str] = https_check.final_url or https_url
|
||||||
if final_url == record.url:
|
if final_url == record.url:
|
||||||
continue
|
continue
|
||||||
|
file_key = _normalize_state_key(record.file_path)
|
||||||
|
if _is_ignored_pair(url_policy=url_policy, file_key=file_key, source=record.url, target=final_url):
|
||||||
|
continue
|
||||||
|
|
||||||
pair = (record.file_path, record.url, final_url)
|
pair = (record.file_path, record.url, final_url)
|
||||||
if pair in seen_pairs:
|
if pair in seen_pairs:
|
||||||
@@ -235,6 +312,7 @@ def _handle_rewrites(
|
|||||||
console.print(f"\n[cyan]{record.file_path}:{record.line}[/cyan]")
|
console.print(f"\n[cyan]{record.file_path}:{record.line}[/cyan]")
|
||||||
console.print(f"[yellow]{record.url}[/yellow] -> [green]{final_url}[/green]")
|
console.print(f"[yellow]{record.url}[/yellow] -> [green]{final_url}[/green]")
|
||||||
if not _prompt_yes_no(console, "Replace old URL? [y/N] "):
|
if not _prompt_yes_no(console, "Replace old URL? [y/N] "):
|
||||||
|
_remember_ignore(url_policy=url_policy, file_key=file_key, source=record.url, target=final_url)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
verification = _cached_check(checker=checker, cache=check_cache, url=final_url)
|
verification = _cached_check(checker=checker, cache=check_cache, url=final_url)
|
||||||
@@ -269,15 +347,36 @@ def main() -> None:
|
|||||||
console.print("No Markdown files found.")
|
console.print("No Markdown files found.")
|
||||||
return
|
return
|
||||||
|
|
||||||
old_state = set() if args.rescan else _load_state(STATE_FILE)
|
single_file_mode = args.path.is_file()
|
||||||
|
state = {"version": 1, "scan_index": {}, "url_policy": {}}
|
||||||
|
scan_index: dict[str, object] = {}
|
||||||
|
url_policy: dict[str, object] = {}
|
||||||
files_to_scan: list[Path] = []
|
files_to_scan: list[Path] = []
|
||||||
skipped_count = 0
|
skipped_count = 0
|
||||||
for file_path in all_files:
|
|
||||||
state_key = _normalize_state_key(file_path)
|
if single_file_mode:
|
||||||
if state_key in old_state:
|
files_to_scan = all_files
|
||||||
skipped_count += 1
|
else:
|
||||||
continue
|
state = _load_state(STATE_FILE)
|
||||||
files_to_scan.append(file_path)
|
scan_index = state.get("scan_index", {})
|
||||||
|
if not isinstance(scan_index, dict):
|
||||||
|
scan_index = {}
|
||||||
|
url_policy = state.get("url_policy", {})
|
||||||
|
if not isinstance(url_policy, dict):
|
||||||
|
url_policy = {}
|
||||||
|
|
||||||
|
if args.reset_url_policy:
|
||||||
|
url_policy = {}
|
||||||
|
|
||||||
|
if args.rescan:
|
||||||
|
scan_index = {}
|
||||||
|
|
||||||
|
for file_path in all_files:
|
||||||
|
state_key = _normalize_state_key(file_path)
|
||||||
|
if state_key in scan_index:
|
||||||
|
skipped_count += 1
|
||||||
|
continue
|
||||||
|
files_to_scan.append(file_path)
|
||||||
|
|
||||||
console.print(
|
console.print(
|
||||||
f"Files total: {len(all_files)} | to scan: {len(files_to_scan)} | skipped: {skipped_count}"
|
f"Files total: {len(all_files)} | to scan: {len(files_to_scan)} | skipped: {skipped_count}"
|
||||||
@@ -287,18 +386,25 @@ def main() -> None:
|
|||||||
console.print("No new files to scan. Use --rescan to force a full scan.")
|
console.print("No new files to scan. Use --rescan to force a full scan.")
|
||||||
return
|
return
|
||||||
|
|
||||||
|
ignored_sources = _ignored_sources_by_file(url_policy) if not single_file_mode else {}
|
||||||
records: list[LinkRecord] = []
|
records: list[LinkRecord] = []
|
||||||
for file_path in files_to_scan:
|
for file_path in files_to_scan:
|
||||||
content = file_path.read_text(encoding="utf-8")
|
content = file_path.read_text(encoding="utf-8")
|
||||||
records.extend(scanner.scan_content(file_path=file_path, content=content))
|
file_records = scanner.scan_content(file_path=file_path, content=content)
|
||||||
|
if not single_file_mode:
|
||||||
new_state = set(old_state)
|
file_key = _normalize_state_key(file_path)
|
||||||
for file_path in files_to_scan:
|
ignored_for_file = ignored_sources.get(file_key, set())
|
||||||
new_state.add(_normalize_state_key(file_path))
|
file_records = [record for record in file_records if record.url not in ignored_for_file]
|
||||||
_save_state(STATE_FILE, new_state)
|
records.extend(file_records)
|
||||||
|
|
||||||
if not records:
|
if not records:
|
||||||
console.print("No links found.")
|
console.print("No links found.")
|
||||||
|
if not single_file_mode:
|
||||||
|
for file_path in files_to_scan:
|
||||||
|
scan_index[_normalize_state_key(file_path)] = {}
|
||||||
|
state["scan_index"] = scan_index
|
||||||
|
state["url_policy"] = url_policy
|
||||||
|
_save_state(STATE_FILE, state)
|
||||||
return
|
return
|
||||||
|
|
||||||
urls = unique_preserve_order(record.url for record in records)
|
urls = unique_preserve_order(record.url for record in records)
|
||||||
@@ -334,11 +440,19 @@ def main() -> None:
|
|||||||
checks=checks,
|
checks=checks,
|
||||||
check_codes=check_codes,
|
check_codes=check_codes,
|
||||||
redirects=redirects,
|
redirects=redirects,
|
||||||
|
url_policy=url_policy,
|
||||||
checker=checker,
|
checker=checker,
|
||||||
editor=editor,
|
editor=editor,
|
||||||
console=console,
|
console=console,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
if not single_file_mode:
|
||||||
|
for file_path in files_to_scan:
|
||||||
|
scan_index[_normalize_state_key(file_path)] = {}
|
||||||
|
state["scan_index"] = scan_index
|
||||||
|
state["url_policy"] = url_policy
|
||||||
|
_save_state(STATE_FILE, state)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
main()
|
main()
|
||||||
|
|||||||
@@ -1,22 +0,0 @@
|
|||||||
# mdlink Test
|
|
||||||
|
|
||||||
- [ok](https://httpbin.org/status/200)
|
|
||||||
- [redirect](https://github.com/)
|
|
||||||
- [broken](https://httpbin.org/status/404)
|
|
||||||
- Naked: https://httpbin.org/status/500
|
|
||||||
- 
|
|
||||||
|
|
||||||
## Additional Cases
|
|
||||||
|
|
||||||
- [redirect with title](https://github.com/ "GitHub redirect")
|
|
||||||
- [query and fragment](https://example.com/docs?lang=de#intro)
|
|
||||||
- [duplicate redirect](https://github.com/)
|
|
||||||
- [duplicate redirect again](https://github.com/)
|
|
||||||
- [non-http scheme should be ignored](mailto:team@example.org)
|
|
||||||
- [ftp should be ignored](ftp://speedtest.tele2.net)
|
|
||||||
- [inline code URL should not be a markdown link](`https://example.org/code`)
|
|
||||||
- [image in text should be ignored] text before  text after
|
|
||||||
- [parentheses in URL](https://en.wikipedia.org/wiki/Function_(mathematics))
|
|
||||||
- [trailing punctuation in sentence] See https://example.org/docs, for details.
|
|
||||||
- autolink angle brackets: <https://example.org/autolink>
|
|
||||||
- bare www should be ignored: www.example.org
|
|
||||||
@@ -1,31 +1,119 @@
|
|||||||
#!/usr/bin/env bash
|
#!/usr/bin/env bash
|
||||||
set -euo pipefail
|
set -euo pipefail
|
||||||
|
|
||||||
OUTPUT_FILE="${1:-test.md}"
|
STATE_FILE=".mdlink-state.json"
|
||||||
|
TEST_DIR="testdata"
|
||||||
|
TEST_FILE="${TEST_DIR}/one.markdown"
|
||||||
|
BIN="./venv/bin/mdlink"
|
||||||
|
if [[ ! -x "${BIN}" ]]; then
|
||||||
|
BIN="./.venv/bin/mdlink"
|
||||||
|
fi
|
||||||
|
|
||||||
cat > "${OUTPUT_FILE}" <<'EOF'
|
if command -v rg >/dev/null 2>&1; then
|
||||||
# mdlink Test
|
FILTER_CMD="rg"
|
||||||
|
else
|
||||||
|
FILTER_CMD="grep -E"
|
||||||
|
fi
|
||||||
|
|
||||||
|
generate_testdata() {
|
||||||
|
mkdir -p "${TEST_DIR}"
|
||||||
|
|
||||||
|
cat > "${TEST_DIR}/one.markdown" <<'EOF'
|
||||||
|
# Test Data One
|
||||||
|
|
||||||
- [ok](https://httpbin.org/status/200)
|
- [ok](https://httpbin.org/status/200)
|
||||||
- [redirect](http://github.com)
|
- [redirect](http://github.com)
|
||||||
- [broken](https://httpbin.org/status/404)
|
- [broken](https://httpbin.org/status/404)
|
||||||
- Naked: https://httpbin.org/status/500
|
- Naked: https://httpbin.org/status/500
|
||||||
- 
|
- 
|
||||||
|
EOF
|
||||||
|
|
||||||
## Additional Cases
|
cat > "${TEST_DIR}/two.markdown" <<'EOF'
|
||||||
|
# Test Data Two
|
||||||
|
|
||||||
- [redirect with title](http://github.com "GitHub redirect")
|
- [redirect with title](http://github.com "GitHub redirect")
|
||||||
- [query and fragment](https://example.com/docs?lang=de#intro)
|
|
||||||
- [duplicate redirect](http://github.com)
|
- [duplicate redirect](http://github.com)
|
||||||
- [duplicate redirect again](http://github.com)
|
|
||||||
- [non-http scheme should be ignored](mailto:team@example.org)
|
|
||||||
- [ftp should be ignored](ftp://speedtest.tele2.net)
|
- [ftp should be ignored](ftp://speedtest.tele2.net)
|
||||||
|
- [query and fragment](https://example.com/docs?lang=de#intro)
|
||||||
- [inline code URL should not be a markdown link](`https://example.org/code`)
|
- [inline code URL should not be a markdown link](`https://example.org/code`)
|
||||||
- [image in text should be ignored] text before  text after
|
- autolink angle brackets: <https://example.org/autolink>
|
||||||
|
EOF
|
||||||
|
|
||||||
|
cat > "${TEST_DIR}/three.markdown" <<'EOF'
|
||||||
|
# Test Data Three
|
||||||
|
|
||||||
- [parentheses in URL](https://en.wikipedia.org/wiki/Function_(mathematics))
|
- [parentheses in URL](https://en.wikipedia.org/wiki/Function_(mathematics))
|
||||||
- [trailing punctuation in sentence] See https://example.org/docs, for details.
|
- [trailing punctuation in sentence] See https://example.org/docs, for details.
|
||||||
- autolink angle brackets: <https://example.org/autolink>
|
- [image in text should be ignored] text before  text after
|
||||||
|
- [mailto should be ignored](mailto:team@example.org)
|
||||||
- bare www should be ignored: www.example.org
|
- bare www should be ignored: www.example.org
|
||||||
EOF
|
EOF
|
||||||
|
|
||||||
echo "Generated ${OUTPUT_FILE}"
|
echo "Generated ${TEST_DIR}/one.markdown"
|
||||||
|
echo "Generated ${TEST_DIR}/two.markdown"
|
||||||
|
echo "Generated ${TEST_DIR}/three.markdown"
|
||||||
|
}
|
||||||
|
|
||||||
|
# Run the state-related smoke checks against the generated test data.
# Reads the globals BIN, FILTER_CMD, TEST_DIR, TEST_FILE and STATE_FILE.
# Each step prints only the summary lines selected by the filter; a filter
# that matches nothing is tolerated via "|| true".
# Returns 1 when the mdlink binary is missing, because "|| true" would
# otherwise hide the failure and every step would appear to pass.
run_state_checks() {
  if [[ ! -x "${BIN}" ]]; then
    echo "error: mdlink binary not found or not executable: ${BIN}" >&2
    return 1
  fi

  echo "Using binary: ${BIN}"
  echo "Using filter: ${FILTER_CMD}"
  echo "Using data dir: ${TEST_DIR}"
  echo

  # NOTE: ${FILTER_CMD} is intentionally unquoted so "grep -E" splits into
  # command and option.
  echo "1) First directory scan"
  rm -f "${STATE_FILE}"
  "${BIN}" "${TEST_DIR}" --timeout 0.5 --check 404 | ${FILTER_CMD} "^Files total|^Checking " || true
  echo

  echo "2) Second directory scan (should skip files via scan_index)"
  "${BIN}" "${TEST_DIR}" --timeout 0.5 --check 404 | ${FILTER_CMD} "^Files total|No new files" || true
  echo

  echo "3) Single-file scan ignores state (should still scan file)"
  "${BIN}" "${TEST_FILE}" --timeout 0.5 --check 404 | ${FILTER_CMD} "^Files total|^Checking " || true
  echo

  echo "4) url_policy ignore reduces checked URLs"
  # Unquoted heredoc delimiter: ${TEST_DIR} is expanded so the policy keys
  # follow the data directory instead of a hard-coded "testdata/".
  cat > "${STATE_FILE}" <<EOF
{
  "version": 1,
  "scan_index": {},
  "url_policy": {
    "${TEST_DIR}/one.markdown": [
      {
        "action": "ignore",
        "source": "https://httpbin.org/status/404",
        "target": "https://httpbin.org/status/404",
        "seen_at": "2026-04-17T12:00:00Z"
      }
    ],
    "${TEST_DIR}/two.markdown": [
      {
        "action": "ignore",
        "source": "http://github.com",
        "target": "https://github.com/",
        "seen_at": "2026-04-17T12:00:01Z"
      }
    ]
  }
}
EOF
  "${BIN}" "${TEST_DIR}" --rescan --timeout 0.5 --check 404 | ${FILTER_CMD} "^Files total|^Checking " || true
  echo

  echo "5) --reset-url-policy brings ignored URL back into checks"
  "${BIN}" "${TEST_DIR}" --rescan --reset-url-policy --timeout 0.5 --check 404 | ${FILTER_CMD} "^Files total|^Checking " || true
  echo

  echo "Done."
}
|
||||||
|
|
||||||
|
generate_only=false
|
||||||
|
if [[ "${1:-}" == "--generate-only" ]]; then
|
||||||
|
generate_only=true
|
||||||
|
fi
|
||||||
|
|
||||||
|
generate_testdata
|
||||||
|
if [[ "${generate_only}" == false ]]; then
|
||||||
|
run_state_checks
|
||||||
|
fi
|
||||||
|
|||||||
Vendored
+7
@@ -0,0 +1,7 @@
|
|||||||
|
# Test Data One
|
||||||
|
|
||||||
|
- [ok](https://httpbin.org/status/200)
|
||||||
|
- [redirect](http://github.com)
|
||||||
|
- [broken](https://httpbin.org/status/404)
|
||||||
|
- Naked: https://httpbin.org/status/500
|
||||||
|
- 
|
||||||
Vendored
+7
@@ -0,0 +1,7 @@
|
|||||||
|
# Test Data Three
|
||||||
|
|
||||||
|
- [parentheses in URL](https://en.wikipedia.org/wiki/Function_(mathematics))
|
||||||
|
- [trailing punctuation in sentence] See https://example.org/docs, for details.
|
||||||
|
- [image in text should be ignored] text before  text after
|
||||||
|
- [mailto should be ignored](mailto:team@example.org)
|
||||||
|
- bare www should be ignored: www.example.org
|
||||||
Vendored
+8
@@ -0,0 +1,8 @@
|
|||||||
|
# Test Data Two
|
||||||
|
|
||||||
|
- [redirect with title](http://github.com "GitHub redirect")
|
||||||
|
- [duplicate redirect](http://github.com)
|
||||||
|
- [ftp should be ignored](ftp://speedtest.tele2.net)
|
||||||
|
- [query and fragment](https://example.com/docs?lang=de#intro)
|
||||||
|
- [inline code URL should not be a markdown link](`https://example.org/code`)
|
||||||
|
- autolink angle brackets: <https://example.org/autolink>
|
||||||
Reference in New Issue
Block a user