הראה קוד מקור ל duplicate_detector

from __future__ import annotations

import hashlib
import os
from pathlib import Path
from typing import Dict, Iterable, List


def _iter_files(base: Path, includes: List[str]) -> Iterable[Path]:
    """Yield the regular files under ``base`` selected by ``includes``.

    With a non-empty ``includes``, each entry is handed to ``base.glob`` in
    order and every match that is a file is yielded; a file matched by more
    than one pattern is yielded once per match. With an empty ``includes``,
    every file found by a recursive walk of ``base`` is yielded.
    """
    if includes:
        # Explicit glob patterns, expanded lazily one after another.
        candidates = (hit for pattern in includes for hit in base.glob(pattern))
    else:
        # No patterns given: fall back to everything below base.
        candidates = base.rglob("*")
    for entry in candidates:
        if entry.is_file():
            yield entry


def _split_norm_lines(text: str) -> List[str]:
    """Return the lines of ``text`` in a canonical form.

    - CRLF and lone CR line endings are treated as LF
    - Trailing whitespace is stripped from every line
    - Empty lines at the very end are discarded, so a trailing newline
      (or several) does not affect the result
    """
    unified = text.replace("\r\n", "\n").replace("\r", "\n")
    stripped = [piece.rstrip() for piece in unified.split("\n")]
    # Walk back from the end to find where the last non-empty line sits.
    keep = len(stripped)
    while keep > 0 and not stripped[keep - 1]:
        keep -= 1
    return stripped[:keep]


def _norm_content(text: str) -> str:
    """Return ``text`` rebuilt from its normalized lines, joined with LF."""
    normalized_lines = _split_norm_lines(text)
    return "\n".join(normalized_lines)



[תיעוד]
def scan_duplicates(base_path: str, *, includes: List[str], min_lines: int = 5, max_files: int = 500) -> Dict[str, List[str]]:
    """Find exact duplicate files by normalized content.

    Files are gathered from the base directory via ``_iter_files``, read as
    UTF-8 with undecodable bytes dropped, and normalized with
    ``_split_norm_lines``. Files whose normalized text is identical are
    grouped under the SHA-1 hex digest of that text.

    Args:
        base_path: Directory to scan; a falsy value means ``"."``.
        includes: Glob patterns relative to the base directory. An empty
            list selects every file beneath it.
        min_lines: Minimum number of normalized lines a file must have to be
            considered. Falsy values and values below 1 are treated as 1.
        max_files: Collection stops as soon as the number of distinct files
            gathered reaches this value.

    Returns:
        Mapping of content hash to the list of paths (relative to the base
        directory) sharing that content. Only hashes with two or more paths
        are included.
    """
    base = Path(base_path or ".").resolve()

    files: List[Path] = []
    seen: set[str] = set()
    for p in _iter_files(base, includes):
        # Overlapping globs can yield the same file more than once; key on
        # the resolved path so each file is considered a single time.
        rp = str(p.resolve())
        if rp in seen:
            continue
        seen.add(rp)
        files.append(p)
        if len(files) >= max_files:
            break

    # Loop-invariant: the smallest number of normalized lines worth hashing.
    threshold = max(1, int(min_lines or 1))

    groups: Dict[str, List[str]] = {}
    for p in files:
        try:
            text = p.read_text(encoding="utf-8", errors="ignore")
        except OSError:
            # Best effort: files that cannot be read are skipped.
            continue
        # Normalize once and reuse the result for both the size filter and
        # the hash; the joined lines are the same string _norm_content builds.
        lines = _split_norm_lines(text)
        if len(lines) < threshold:
            continue
        digest = hashlib.sha1("\n".join(lines).encode("utf-8")).hexdigest()
        rel = os.path.relpath(str(p), str(base))
        groups.setdefault(digest, []).append(rel)

    # Keep only hashes shared by at least two files.
    return {digest: paths for digest, paths in groups.items() if len(paths) >= 2}