# Source code for duplicate_detector

from __future__ import annotations

import hashlib
import os
from pathlib import Path
from typing import Dict, Iterable, List


def _iter_files(base: Path, includes: List[str]) -> Iterable[Path]:
    """Lazily yield the regular files selected under *base*.

    With an empty (or otherwise falsy) *includes*, every file below *base*
    is produced via a recursive walk. Otherwise each entry of *includes* is
    used as a glob pattern relative to *base*, in the order given. A file
    matched by more than one pattern is yielded once per match; no
    de-duplication happens here.
    """
    if includes:
        # One pattern after another, each expanded relative to the base.
        candidates = (
            match for pattern in includes for match in base.glob(pattern)
        )
    else:
        # No patterns supplied: fall back to everything below the base.
        candidates = base.rglob("*")

    # Directories and other non-file entries are filtered out.
    yield from (entry for entry in candidates if entry.is_file())


def _split_norm_lines(text: str) -> List[str]:
    """Return the lines of *text* in a normalized form.

    - CRLF and lone CR line endings are treated as LF
    - Trailing whitespace is stripped from every line
    - Empty lines at the very end are discarded, so a trailing newline
      (or several) does not affect the result
    """
    unified = text.replace("\r\n", "\n").replace("\r", "\n")
    stripped = [piece.rstrip() for piece in unified.split("\n")]

    # Walk back from the end to find the last non-empty line.
    end = len(stripped)
    while end > 0 and stripped[end - 1] == "":
        end -= 1
    return stripped[:end]


def _norm_content(text: str) -> str:
    """Return *text* rebuilt from its normalized lines, joined with LF."""
    normalized_lines = _split_norm_lines(text)
    return "\n".join(normalized_lines)


def scan_duplicates(
    base_path: str,
    *,
    includes: List[str],
    min_lines: int = 5,
    max_files: int = 500,
) -> Dict[str, List[str]]:
    """Find exact duplicate files by normalized content.

    Files are read as UTF-8 (undecodable bytes are dropped), normalized with
    ``_split_norm_lines`` and grouped by the SHA-1 of the normalized text.

    Args:
        base_path: Directory to scan; an empty value means the current
            directory. It is resolved to an absolute path.
        includes: Glob patterns relative to the base directory. An empty
            list selects every file below it.
        min_lines: Files with fewer normalized lines than this are ignored.
            Values below 1 (or falsy values) are treated as 1.
        max_files: Collection of candidate files stops once this many
            distinct files have been gathered.

    Returns:
        Mapping of content hash -> list of file paths relative to the base
        directory. Only hashes shared by at least two files are included.
    """
    base = Path(base_path or ".").resolve()

    files: List[Path] = []
    seen: set[str] = set()
    for p in _iter_files(base, includes):
        # De-duplicate the same path reached through overlapping globs.
        rp = str(p.resolve())
        if rp in seen:
            continue
        seen.add(rp)
        files.append(p)
        if len(files) >= max_files:
            break

    # Loop-invariant: the effective minimum is never lower than one line.
    threshold = max(1, int(min_lines or 1))

    groups: Dict[str, List[str]] = {}
    for p in files:
        try:
            text = p.read_text(encoding="utf-8", errors="ignore")
        except Exception:
            # Best-effort scan: a file that cannot be read is skipped.
            continue

        # Normalize once and reuse the lines for both the size check and the
        # hash input (empty text simply yields no lines).
        lines = _split_norm_lines(text)
        if len(lines) < threshold:
            continue

        # Same LF join that _norm_content applies to the normalized lines.
        norm = "\n".join(lines)
        h = hashlib.sha1(norm.encode("utf-8")).hexdigest()
        rel = os.path.relpath(str(p), str(base))
        groups.setdefault(h, []).append(rel)

    # Keep only hashes with duplicates.
    return {h: paths for h, paths in groups.items() if len(paths) >= 2}