Mirror of https://github.com/PlatypusPus/MushroomEmpire.git (synced 2026-02-07 22:18:59 +00:00)
Merge branch 'main' into main
.gitignore (vendored): 4 lines changed
@@ -57,4 +57,6 @@ reports/*.pdf
 # Data (keep demo dataset)
 *.csv
 !Datasets/loan_data.csv
 
+Data
+
Datasets/loan_data_cleaned_audit.json (new file, 142 lines)
@@ -0,0 +1,142 @@
{
  "metadata": {
    "timestamp": "2025-11-07T08:59:26.645555",
    "original_rows": 45000,
    "original_columns": 14,
    "cleaned_rows": 45000,
    "cleaned_columns": 13,
    "presidio_version": "enabled",
    "gpu_acceleration": {
      "enabled": true,
      "cuda_available": true,
      "device": "NVIDIA GeForce RTX 4050 Laptop GPU",
      "gpu_memory_gb": 5.99658203125
    }
  },
  "summary": {
    "columns_removed": [
      "person_education"
    ],
    "columns_anonymized": [
      "loan_intent",
      "person_home_ownership"
    ],
    "total_cells_affected": 49906
  },
  "details": {
    "loan_intent": {
      "action": "ANONYMIZED",
      "strategies_applied": [
        "HASH"
      ],
      "reason": "Contains ORGANIZATION entities. Applied hash anonymization to protect privacy.",
      "entity_types_found": [
        "ORGANIZATION"
      ],
      "num_affected_rows": 23512,
      "percentage_affected": "52.2%",
      "examples": [
        {
          "before": "MEDICAL",
          "after": "a978e21c3754862e57020380a3e9ea7ed66e16dfa3db6fb28b"
        },
        {
          "before": "MEDICAL",
          "after": "a978e21c3754862e57020380a3e9ea7ed66e16dfa3db6fb28b"
        },
        {
          "before": "MEDICAL",
          "after": "a978e21c3754862e57020380a3e9ea7ed66e16dfa3db6fb28b"
        }
      ],
      "presidio_metrics": {
        "avg_confidence": 0.85,
        "detections": [
          {
            "entity_type": "ORGANIZATION",
            "count": 49,
            "avg_confidence": 0.85,
            "max_confidence": 0.85,
            "min_confidence": 0.85
          }
        ]
      },
      "gdpr_compliance": []
    },
    "person_home_ownership": {
      "action": "ANONYMIZED",
      "strategies_applied": [
        "MASK"
      ],
      "reason": "Contains ORGANIZATION, LOCATION entities. Applied mask anonymization to protect privacy.",
      "entity_types_found": [
        "ORGANIZATION",
        "LOCATION"
      ],
      "num_affected_rows": 26394,
      "percentage_affected": "58.7%",
      "examples": [
        {
          "before": "RENT",
          "after": "****"
        },
        {
          "before": "OWN",
          "after": "***"
        },
        {
          "before": "RENT",
          "after": "****"
        }
      ],
      "presidio_metrics": {
        "avg_confidence": 0.85,
        "detections": [
          {
            "entity_type": "ORGANIZATION",
            "count": 24,
            "avg_confidence": 0.85,
            "max_confidence": 0.85,
            "min_confidence": 0.85
          },
          {
            "entity_type": "LOCATION",
            "count": 49,
            "avg_confidence": 0.85,
            "max_confidence": 0.85,
            "min_confidence": 0.85
          }
        ]
      },
      "gdpr_compliance": [
        "Art. 4(1) - Personal data (location)"
      ]
    },
    "person_education": {
      "action": "REMOVED",
      "reason": "Contains HIGH risk PII requiring removal",
      "entity_types_found": [
        "ORGANIZATION"
      ],
      "risk_level": "HIGH",
      "presidio_metrics": {
        "detections": [
          {
            "entity_type": "ORGANIZATION",
            "count": 4,
            "avg_confidence": 0.85,
            "max_confidence": 0.85,
            "min_confidence": 0.85
          }
        ]
      },
      "gdpr_compliance": []
    }
  },
  "compliance": {
    "gdpr_articles_applied": [
      "Art. 4(1) - Personal data (location)"
    ],
    "risk_mitigation": {}
  }
}
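The audit file above is plain JSON, so downstream tooling can consume it directly. A minimal sketch of reading it back (not part of this commit; the path and key names are taken from the file as shown):

import json

with open("Datasets/loan_data_cleaned_audit.json") as f:
    audit = json.load(f)

meta = audit["metadata"]
print(f"Rows: {meta['original_rows']} -> {meta['cleaned_rows']}, "
      f"columns: {meta['original_columns']} -> {meta['cleaned_columns']}")

# Per-column actions, e.g. "loan_intent: ANONYMIZED (ORGANIZATION)"
for column, detail in audit["details"].items():
    entities = ", ".join(detail["entity_types_found"])
    print(f"  {column}: {detail['action']} ({entities})")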
cleaning.py (105 lines changed)
@@ -1,6 +1,7 @@
 """
 Data Cleaning Module - PII Detection and Anonymization
 Handles GDPR-compliant data cleaning using Presidio for PII detection
+GPU-accelerated for faster processing of large datasets
 """
 
 import pandas as pd
@@ -20,6 +21,35 @@ try:
 except ImportError:
     PRESIDIO_AVAILABLE = False
     print("Warning: Presidio not installed. Run: pip install presidio-analyzer presidio-anonymizer")
+# GPU detection
+try:
+    import torch
+    CUDA_AVAILABLE = torch.cuda.is_available()
+    if CUDA_AVAILABLE:
+        GPU_DEVICE = 0  # Use first GPU
+        GPU_NAME = torch.cuda.get_device_name(0)
+        GPU_MEMORY = torch.cuda.get_device_properties(0).total_memory / 1024**3  # GB
+    else:
+        GPU_DEVICE = -1
+        GPU_NAME = None
+        GPU_MEMORY = 0
+except ImportError:
+    CUDA_AVAILABLE = False
+    GPU_DEVICE = -1
+    GPU_NAME = None
+    GPU_MEMORY = 0
+
+try:
+    import spacy
+    SPACY_AVAILABLE = True
+    # Check if spaCy can use GPU
+    if CUDA_AVAILABLE:
+        spacy.require_gpu()
+except ImportError:
+    SPACY_AVAILABLE = False
+except Exception:
+    # GPU not available for spaCy, will fall back to CPU
+    pass
 
 
 def convert_to_json_serializable(obj):
@@ -112,18 +142,23 @@ class DataCleaner:
         ... )
         """
 
-    def __init__(self, df: pd.DataFrame, config: Optional[CleaningConfig] = None):
+    def __init__(self, df: pd.DataFrame, config: Optional[CleaningConfig] = None, use_gpu: bool = True):
         """
         Initialize the data cleaner
 
         Args:
             df: Input DataFrame to clean
             config: Optional custom configuration
+            use_gpu: Whether to use GPU acceleration if available (default: True)
         """
         self.df = df.copy()
         self.config = config or CleaningConfig()
         self.audit_log = []
         self.cleaning_actions = {}
+        self.use_gpu = use_gpu and CUDA_AVAILABLE
+
+        # Display GPU info
+        self._display_gpu_info()
 
         # Initialize Presidio engines
         if PRESIDIO_AVAILABLE:
@@ -134,8 +169,29 @@ class DataCleaner:
                 "Install with: pip install presidio-analyzer presidio-anonymizer"
             )
 
+    def _display_gpu_info(self):
+        """Display GPU availability and configuration"""
+        print("\n" + "="*70)
+        print("🖥️ HARDWARE CONFIGURATION")
+        print("="*70)
+
+        if CUDA_AVAILABLE and self.use_gpu:
+            print("✓ GPU ACCELERATION: ENABLED")
+            print(f"   Device: {GPU_NAME}")
+            print(f"   Memory: {GPU_MEMORY:.2f} GB")
+            print(f"   CUDA Device ID: {GPU_DEVICE}")
+        elif CUDA_AVAILABLE and not self.use_gpu:
+            print("⚠️ GPU ACCELERATION: DISABLED (use_gpu=False)")
+            print(f"   Available GPU: {GPU_NAME} ({GPU_MEMORY:.2f} GB)")
+        else:
+            print("⚠️ GPU ACCELERATION: NOT AVAILABLE")
+            # 'torch' is in module globals only if the import above succeeded
+            print(f"   Reason: {'PyTorch not installed' if 'torch' not in globals() else 'No CUDA device detected'}")
+            print("   Install: pip install torch --index-url https://download.pytorch.org/whl/cu121")
+
+        print("="*70 + "\n")
+
     def _init_presidio(self):
-        """Initialize Presidio analyzer and anonymizer engines with Nordic recognizers"""
+        """Initialize Presidio analyzer and anonymizer engines with GPU support"""
         # Create NLP engine configuration
         configuration = {
             "nlp_engine_name": "spacy",
@@ -147,18 +203,23 @@ class DataCleaner:
             provider = NlpEngineProvider(nlp_configuration=configuration)
             nlp_engine = provider.create_engine()
 
-            # Create registry and add Nordic recognizers
-            registry = RecognizerRegistry()
-            registry.load_predefined_recognizers(nlp_engine=nlp_engine)
-
-            # Add Nordic-specific recognizers
-            self._add_nordic_recognizers(registry)
-
-            # Create analyzer with custom registry
-            self.analyzer = AnalyzerEngine(registry=registry, nlp_engine=nlp_engine)
+            # Enable GPU for spaCy if available
+            if self.use_gpu and SPACY_AVAILABLE:
+                try:
+                    import spacy
+                    # Move spaCy model to GPU
+                    spacy.require_gpu()
+                    print("✓ spaCy GPU acceleration enabled")
+                except Exception as e:
+                    print(f"⚠️ Could not enable spaCy GPU: {e}")
+                    print("   Falling back to CPU for NLP processing")
+
+            # Create analyzer with NLP engine
+            self.analyzer = AnalyzerEngine(nlp_engine=nlp_engine)
+
             self.anonymizer = AnonymizerEngine()
 
-            print("✓ Presidio engines initialized with Nordic PII recognizers")
+            device_info = "GPU" if self.use_gpu else "CPU"
+            print(f"✓ Presidio engines initialized successfully ({device_info} mode)")
         except Exception as e:
             # Fallback to default configuration if spaCy model not available
             print(f"Warning: Could not load spaCy model, using default configuration: {e}")
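As an aside, the engine setup in this hunk follows presidio-analyzer's standard pattern. A minimal self-contained sketch of the same flow, independent of this class (the en_core_web_lg model name is an assumption; the models entry of the configuration dict is cut off by the hunk above):

from presidio_analyzer import AnalyzerEngine
from presidio_analyzer.nlp_engine import NlpEngineProvider

configuration = {
    "nlp_engine_name": "spacy",
    "models": [{"lang_code": "en", "model_name": "en_core_web_lg"}],
}
nlp_engine = NlpEngineProvider(nlp_configuration=configuration).create_engine()
analyzer = AnalyzerEngine(nlp_engine=nlp_engine)

# Analyze a sample string for PII entities
results = analyzer.analyze(text="Contact John Smith in Oslo", language="en")
for r in results:
    print(r.entity_type, r.score)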
@@ -313,7 +374,7 @@ class DataCleaner:
         scan_all_cells: bool
     ) -> Dict[str, List[Dict]]:
         """
-        Detect PII at column and cell level
+        Detect PII at column and cell level (GPU-accelerated when available)
 
         Returns:
             Dictionary mapping column names to list of detected entities
@@ -332,7 +393,8 @@ class DataCleaner:
         text_columns = df.select_dtypes(include=['object']).columns.tolist()
         columns_to_scan = list(set(columns_to_scan + text_columns))
 
-        print(f"  Scanning {len(columns_to_scan)} columns: {columns_to_scan}")
+        device_info = f"GPU ({GPU_NAME})" if self.use_gpu else "CPU"
+        print(f"  Scanning {len(columns_to_scan)} columns using {device_info}: {columns_to_scan}")
 
         for column in columns_to_scan:
             print(f"    Analyzing '{column}'...", end=" ")
@@ -632,7 +694,13 @@ class DataCleaner:
             'original_columns': len(self.df.columns),
             'cleaned_rows': len(cleaned_df),
             'cleaned_columns': len(cleaned_df.columns),
-            'presidio_version': 'enabled' if PRESIDIO_AVAILABLE else 'disabled'
+            'presidio_version': 'enabled' if PRESIDIO_AVAILABLE else 'disabled',
+            'gpu_acceleration': {
+                'enabled': self.use_gpu,
+                'cuda_available': CUDA_AVAILABLE,
+                'device': GPU_NAME if self.use_gpu else 'CPU',
+                'gpu_memory_gb': GPU_MEMORY if self.use_gpu else 0
+            }
         },
         'summary': {
             'total_rows': len(self.df),
@@ -1285,19 +1353,22 @@ def main():
     import sys
 
     if len(sys.argv) < 2:
-        print("Usage: python cleaning.py <data_file.csv>")
+        print("Usage: python cleaning.py <data_file.csv> [--no-gpu]")
         print("Example: python cleaning.py Datasets/loan_data.csv")
+        print("Options:")
+        print("  --no-gpu    Disable GPU acceleration (use CPU only)")
         sys.exit(1)
 
     data_path = sys.argv[1]
+    use_gpu = '--no-gpu' not in sys.argv
 
     # Load data
     print(f"Loading data from {data_path}...")
     df = pd.read_csv(data_path)
     print(f"Loaded {len(df)} rows × {len(df.columns)} columns")
 
-    # Initialize cleaner
-    cleaner = DataCleaner(df)
+    # Initialize cleaner with GPU support
+    cleaner = DataCleaner(df, use_gpu=use_gpu)
 
     # Run cleaning (interactive mode)
     cleaned_df, audit_report = cleaner.clean(
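Taken together, these changes make GPU use opt-out rather than hard-wired. A minimal usage sketch of the new constructor parameter (assumes cleaning.py is importable as a module; not part of the commit):

import pandas as pd
from cleaning import DataCleaner  # assumes cleaning.py is on the import path

df = pd.read_csv("Datasets/loan_data.csv")

# use_gpu defaults to True but is ANDed with CUDA_AVAILABLE,
# so the cleaner falls back to CPU automatically on machines without CUDA.
cleaner = DataCleaner(df, use_gpu=False)  # force CPU explicitly, like --no-gpu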
discovery/main.py (new file, 167 lines)
@@ -0,0 +1,167 @@
import csv
import re
from pathlib import Path
from collections import Counter
from datetime import datetime

ROOT = Path("../Data/Politics")

# Try to import spaCy, fall back to basic extraction if not available
try:
    import spacy
    nlp = spacy.load("en_core_web_sm")
    USE_SPACY = True
except Exception:
    USE_SPACY = False

# Regex patterns for deterministic detection
patterns = {
    "EMAIL": re.compile(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+"),
    "PHONE": re.compile(r"(?:\+?\d{1,3}[-.\s]?)?(?:\(?\d{2,4}\)?[-.\s]?)?\d{3,4}[-.\s]?\d{3,4}"),
    "UUID": re.compile(r"\b[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}\b"),
    "IBAN": re.compile(r"\b[A-Z]{2}\d{2}[A-Z0-9]{1,30}\b"),
    "DATE": re.compile(r"\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b|\b\d{4}[/-]\d{1,2}[/-]\d{1,2}\b|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* \d{1,2},? \d{4}"),
    "URL": re.compile(r"https?://[^\s]+"),
    "SSN": re.compile(r"\b\d{3}-\d{2}-\d{4}\b"),
}

def find_entities(text):
    """Extract entities using regex patterns."""
    found = {}
    for label, pattern in patterns.items():
        matches = pattern.findall(text)
        if matches:
            found[label] = list(set(matches))[:5]  # Limit to 5 per type
    return found

def extract_with_spacy(text):
    """Extract named entities using spaCy."""
    if not USE_SPACY:
        return {}, {}, {}

    doc = nlp(text[:10000])  # Limit text length for performance

    persons = []
    orgs = []
    locations = []

    for ent in doc.ents:
        if ent.label_ == "PERSON":
            persons.append(ent.text)
        elif ent.label_ == "ORG":
            orgs.append(ent.text)
        elif ent.label_ in ["GPE", "LOC"]:
            locations.append(ent.text)

    # Return most common entities
    return (
        dict(Counter(persons).most_common(5)),
        dict(Counter(orgs).most_common(5)),
        dict(Counter(locations).most_common(5))
    )

def extract_metadata(text, filename):
    """Extract basic metadata from text."""
    metadata = {
        "char_count": len(text),
        "word_count": len(text.split()),
        "line_count": text.count('\n') + 1,
        "file_extension": Path(filename).suffix,
    }
    return metadata

def detect_content_type(text):
    """Heuristic content type detection."""
    text_lower = text.lower()

    # Check for common document types
    if any(word in text_lower[:1000] for word in ['dear', 'sincerely', 'regards']):
        return "letter"
    elif any(word in text_lower[:500] for word in ['article', 'section', 'amendment']):
        return "legal"
    elif any(word in text_lower[:500] for word in ['press release', 'for immediate release']):
        return "press_release"
    elif re.search(r'^\s*#', text[:100], re.MULTILINE):
        return "markdown"
    elif '<html' in text_lower[:200]:
        return "html"
    else:
        return "unknown"

# Define fieldnames
fieldnames = [
    "filename", "file_extension", "char_count", "word_count", "line_count",
    "content_type", "text_preview",
    "EMAIL", "PHONE", "UUID", "IBAN", "DATE", "URL", "SSN",
    "persons", "organizations", "locations"
]

print("Processing files...")
with open("discovery_dataset.csv", "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    file_count = 0
    for path in ROOT.rglob("*"):
        if not path.is_file():
            continue

        # Skip binary files
        if path.suffix.lower() in ['.exe', '.dll', '.so', '.dylib', '.bin', '.jpg', '.png', '.gif', '.pdf']:
            continue

        try:
            text = path.read_text(encoding="utf-8", errors="ignore")
        except Exception as e:
            print(f"  Error reading {path.name}: {e}")
            continue

        if not text.strip():
            continue

        file_count += 1
        if file_count % 10 == 0:
            print(f"Processed {file_count} files...")

        # Initialize row
        row = {"filename": str(path.relative_to(ROOT.parent))}

        # Extract metadata
        metadata = extract_metadata(text, path.name)
        row.update(metadata)

        # Detect content type
        row["content_type"] = detect_content_type(text)
        row["text_preview"] = text[:500].replace('\n', ' ').replace('\r', ' ')

        # Extract entities with regex
        entities = find_entities(text)
        for key, values in entities.items():
            row[key] = "; ".join(values) if values else ""

        # Fill in missing pattern fields
        for pattern_key in ["EMAIL", "PHONE", "UUID", "IBAN", "DATE", "URL", "SSN"]:
            if pattern_key not in row:
                row[pattern_key] = ""

        # Extract named entities with spaCy
        if USE_SPACY:
            persons, orgs, locs = extract_with_spacy(text)
            row["persons"] = "; ".join([f"{k}({v})" for k, v in persons.items()])
            row["organizations"] = "; ".join([f"{k}({v})" for k, v in orgs.items()])
            row["locations"] = "; ".join([f"{k}({v})" for k, v in locs.items()])
        else:
            row["persons"] = ""
            row["organizations"] = ""
            row["locations"] = ""

        writer.writerow(row)

print(f"\nComplete! Processed {file_count} files.")
print("Output: discovery_dataset.csv")

# Suggest installing spaCy when it was not available for this run
if file_count > 0 and not USE_SPACY:
    print("\nTo install spaCy for better entity extraction:")
    print("  pip install spacy")
    print("  python -m spacy download en_core_web_sm")
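A quick self-contained smoke test of the regex-extraction idea (the two patterns are copied from the table in the script above; not part of the commit):

import re

patterns = {
    "EMAIL": re.compile(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+"),
    "URL": re.compile(r"https?://[^\s]+"),
}

def find_entities(text):
    """Return up to five unique matches per pattern, keyed by entity label."""
    found = {}
    for label, pattern in patterns.items():
        matches = pattern.findall(text)
        if matches:
            found[label] = list(set(matches))[:5]
    return found

print(find_entities("Mail jane.doe@example.org or see https://example.org/report"))
# -> {'EMAIL': ['jane.doe@example.org'], 'URL': ['https://example.org/report']}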