diff --git a/GDPRArticles.pdf b/GDPRArticles.pdf
new file mode 100644
index 0000000..554fd98
Binary files /dev/null and b/GDPRArticles.pdf differ
diff --git a/api/routers/detect_pii.py b/api/routers/detect_pii.py
index baeedc1..865907b 100644
--- a/api/routers/detect_pii.py
+++ b/api/routers/detect_pii.py
@@ -20,6 +20,8 @@ from data_cleaning.config import (
     ENTITY_STRATEGY_MAP,
     STRATEGIES,
     GDPR_COMPLIANCE,
+    COLUMN_CONTEXT_FILTERS,
+    EXCLUSION_PATTERNS,
     get_strategy_for_entity,
     get_risk_level
 )
diff --git a/data_cleaning/cleaner.py b/data_cleaning/cleaner.py
index 658b1b8..61e97f5 100644
--- a/data_cleaning/cleaner.py
+++ b/data_cleaning/cleaner.py
@@ -8,6 +8,7 @@ import pandas as pd
 import numpy as np
 import hashlib
 import json
+import re
 from datetime import datetime
 from typing import Dict, List, Tuple, Optional, Any
 from collections import defaultdict
@@ -375,10 +376,14 @@ class DataCleaner:
     ) -> Dict[str, List[Dict]]:
         """
         Detect PII at column and cell level (GPU-accelerated when available)
+        With intelligent filtering for false positives
 
         Returns:
             Dictionary mapping column names to list of detected entities
         """
+        import re
+        from data_cleaning.config import COLUMN_CONTEXT_FILTERS, EXCLUSION_PATTERNS
+
         pii_detections = defaultdict(list)
 
         # Determine which columns to scan
@@ -417,26 +422,69 @@ class DataCleaner:
                 )
 
                 if results:
-                    # Aggregate by entity type
-                    entity_summary = defaultdict(lambda: {'count': 0, 'scores': []})
+                    # Aggregate by entity type with filtering
+                    entity_summary = defaultdict(lambda: {'count': 0, 'scores': [], 'filtered': 0})
+                    filtered_reasons = []
 
                     for result in results:
-                        entity_summary[result.entity_type]['count'] += 1
-                        entity_summary[result.entity_type]['scores'].append(result.score)
+                        entity_type = result.entity_type
+                        # Extract detected text from original string using start/end positions
+                        detected_text = combined_text[result.start:result.end]
+
+                        # ✅ FILTER 1: Column Context Filtering
+                        # Skip if entity type should be ignored based on column name
+                        context_filtered = False
+                        for pattern, ignored_entities in COLUMN_CONTEXT_FILTERS.items():
+                            if re.search(pattern, column.lower()) and entity_type in ignored_entities:
+                                context_filtered = True
+                                entity_summary[entity_type]['filtered'] += 1
+                                if f"column context ({pattern})" not in filtered_reasons:
+                                    filtered_reasons.append(f"column context ({pattern})")
+                                break
+
+                        if context_filtered:
+                            continue
+
+                        # ✅ FILTER 2: Value Pattern Exclusions
+                        # Skip if detected value matches exclusion patterns
+                        pattern_filtered = False
+                        if entity_type in EXCLUSION_PATTERNS:
+                            for exclusion_pattern in EXCLUSION_PATTERNS[entity_type]:
+                                if re.match(exclusion_pattern, detected_text, re.IGNORECASE):
+                                    pattern_filtered = True
+                                    entity_summary[entity_type]['filtered'] += 1
+                                    if "value pattern" not in filtered_reasons:
+                                        filtered_reasons.append("value pattern")
+                                    break
+
+                        if pattern_filtered:
+                            continue
+
+                        # ✅ Not filtered - count as valid detection
+                        entity_summary[entity_type]['count'] += 1
+                        entity_summary[entity_type]['scores'].append(result.score)
 
-                    # Store detection results
+                    # Store detection results (only non-filtered)
+                    detected_types = []
                     for entity_type, info in entity_summary.items():
-                        avg_confidence = np.mean(info['scores'])
-                        pii_detections[column].append({
-                            'entity_type': entity_type,
-                            'count': info['count'],
-                            'avg_confidence': avg_confidence,
-                            'max_confidence': max(info['scores']),
-                            'min_confidence': min(info['scores'])
-                        })
+                        if info['count'] > 0:  # Only include if we have valid (non-filtered) detections
+                            avg_confidence = np.mean(info['scores'])
+                            pii_detections[column].append({
+                                'entity_type': entity_type,
+                                'count': info['count'],
+                                'avg_confidence': avg_confidence,
+                                'max_confidence': max(info['scores']),
+                                'min_confidence': min(info['scores'])
+                            })
+                            detected_types.append(entity_type)
 
-                    detected_types = [d['entity_type'] for d in pii_detections[column]]
-                    print(f"✓ Found: {', '.join(detected_types)}")
+                    if detected_types:
+                        print(f"✓ Found: {', '.join(detected_types)}")
+                    elif any(info['filtered'] > 0 for info in entity_summary.values()):
+                        total_filtered = sum(info['filtered'] for info in entity_summary.values())
+                        print(f"(filtered {total_filtered} false positives: {', '.join(filtered_reasons[:2])})")
+                    else:
+                        print("(no PII)")
                 else:
                     print("(no PII)")
 
diff --git a/data_cleaning/config.py b/data_cleaning/config.py
index f5944c5..a6ee24a 100644
--- a/data_cleaning/config.py
+++ b/data_cleaning/config.py
@@ -126,9 +126,63 @@ GDPR_COMPLIANCE = {
 # Presidio Analyzer Settings
 PRESIDIO_CONFIG = {
     'language': 'en',
-    'score_threshold': 0.5,  # Minimum confidence to report
+    'score_threshold': 0.6,  # Minimum confidence to report (raised from 0.5 to reduce false positives)
     'entities': None,  # None = detect all, or specify list like ['EMAIL_ADDRESS', 'PHONE_NUMBER']
-    'allow_list': [],  # Terms to ignore (e.g., company names that look like PII)
+    'allow_list': ['l1', 'l2', 'L1', 'L2', 'NA', 'N/A', 'null', 'none'],  # Common non-PII values
+}
+
+# Column Context Filters - Ignore specific entity types based on column name patterns
+# This prevents false positives when column names provide context
+COLUMN_CONTEXT_FILTERS = {
+    # Column name pattern (regex) -> List of entity types to IGNORE in that column
+    r'.*credit.*': ['US_DRIVER_LICENSE', 'US_PASSPORT', 'PERSON'],
+    r'.*rating.*': ['US_DRIVER_LICENSE', 'US_PASSPORT'],
+    r'.*level.*': ['US_DRIVER_LICENSE', 'US_PASSPORT'],
+    r'.*score.*': ['US_DRIVER_LICENSE', 'US_PASSPORT', 'PERSON'],
+    r'.*category.*': ['US_DRIVER_LICENSE', 'PERSON'],
+    r'.*status.*': ['US_DRIVER_LICENSE', 'PERSON'],
+    r'.*type.*': ['US_DRIVER_LICENSE', 'PERSON'],
+    r'.*grade.*': ['US_DRIVER_LICENSE', 'PERSON'],
+    r'.*class.*': ['US_DRIVER_LICENSE', 'PERSON'],
+    r'.*rank.*': ['US_DRIVER_LICENSE', 'PERSON'],
+    r'.*tier.*': ['US_DRIVER_LICENSE', 'PERSON'],
+    r'.*segment.*': ['US_DRIVER_LICENSE', 'PERSON'],
+    r'.*group.*': ['US_DRIVER_LICENSE', 'PERSON'],
+    r'.*code.*': ['PERSON'],  # Codes are rarely names
+    r'.*id$': ['PERSON'],  # IDs ending in 'id' are rarely names
+    r'.*_id$': ['PERSON'],  # Same for underscore_id
+}
+
+# Value Pattern Exclusions - Ignore values matching these patterns for specific entity types
+# This catches false positives based on the actual detected value format
+EXCLUSION_PATTERNS = {
+    'US_DRIVER_LICENSE': [
+        r'^[a-zA-Z]\d{1,2}$',  # Single letter + 1-2 digits (e.g., l1, l2, A1, B12)
+        r'^[a-zA-Z]{1,2}$',  # 1-2 letters only (e.g., A, AB)
+        r'^level\s*\d+$',  # "level 1", "level 2", etc.
+        r'^tier\s*\d+$',  # "tier 1", "tier 2", etc.
+        r'^grade\s*[a-zA-Z]$',  # "grade A", "grade B", etc.
+    ],
+    'US_PASSPORT': [
+        r'^[a-zA-Z]\d{1,2}$',  # Single letter + 1-2 digits
+        r'^[a-zA-Z]{1,2}$',  # 1-2 letters only
+    ],
+    'PERSON': [
+        r'^(admin|user|guest|system|default|test|demo)$',  # Generic usernames
+        r'^[a-zA-Z]\d*$',  # Single letter with optional numbers (A, A1, B2)
+        r'^(yes|no|true|false|y|n|t|f)$',  # Boolean values
+        r'^(male|female|m|f|other)$',  # Gender categories
+        r'^(low|medium|high|good|bad|excellent|poor)$',  # Rating values
+    ],
+    'EMAIL_ADDRESS': [
+        r'^(test|demo|example|sample)@',  # Test emails
+        r'@(test|demo|example|sample)\.',  # Test domains
+    ],
+    'PHONE_NUMBER': [
+        r'^(000|111|222|333|444|555|666|777|888|999)[-\s]',  # Fake phone patterns
+        r'^1{6,}$',  # All 1s
+        r'^0{6,}$',  # All 0s
+    ],
 }
 
 # Custom Recognizers (domain-specific patterns)
diff --git a/requirements.txt b/requirements.txt
index b4e9a2c..4912a74 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -20,7 +20,7 @@ python-multipart>=0.0.6
 # torch>=2.0.0 --index-url https://download.pytorch.org/whl/cu121
 
 # Chatbot (WIP - not exposed in API yet)
-gpt4all>=2.0.0annotated-doc==0.0.3
+gpt4all>=2.0.0
 annotated-types==0.7.0
 anyio==4.11.0
 blis==1.3.0
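
Note on the filtering logic (not part of the patch): the sketch below shows how the two new filters are intended to interact, using trimmed-down copies of COLUMN_CONTEXT_FILTERS and EXCLUSION_PATTERNS and a hypothetical is_false_positive() helper; the real implementation lives inline in the DataCleaner detection loop above. The cheap column-name check runs first, the value-shape regexes only run for entity types listed in EXCLUSION_PATTERNS.

import re

# Trimmed-down copies of the new config dictionaries (see data_cleaning/config.py above).
COLUMN_CONTEXT_FILTERS = {
    r'.*credit.*': ['US_DRIVER_LICENSE', 'US_PASSPORT', 'PERSON'],
    r'.*level.*': ['US_DRIVER_LICENSE', 'US_PASSPORT'],
}
EXCLUSION_PATTERNS = {
    'US_DRIVER_LICENSE': [r'^[a-zA-Z]\d{1,2}$'],
    'PERSON': [r'^(low|medium|high|good|bad|excellent|poor)$'],
}

def is_false_positive(column: str, entity_type: str, detected_text: str) -> bool:
    """Return True when a Presidio hit should be discarded by the new filters."""
    # Filter 1: column context - drop entity types that make no sense for this column name.
    for pattern, ignored_entities in COLUMN_CONTEXT_FILTERS.items():
        if re.search(pattern, column.lower()) and entity_type in ignored_entities:
            return True
    # Filter 2: value patterns - drop values whose shape marks them as non-PII.
    for exclusion_pattern in EXCLUSION_PATTERNS.get(entity_type, []):
        if re.match(exclusion_pattern, detected_text, re.IGNORECASE):
            return True
    return False

print(is_false_positive("credit_level", "US_DRIVER_LICENSE", "l2"))          # True  (column context)
print(is_false_positive("customer_rating", "PERSON", "excellent"))           # True  (value pattern)
print(is_false_positive("license_number", "US_DRIVER_LICENSE", "D1234567"))  # False (kept as real PII)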