From 7e3938d555e32e7527c28869a458dc88febd6b9b Mon Sep 17 00:00:00 2001 From: nearlynithin Date: Fri, 7 Nov 2025 04:32:02 +0530 Subject: [PATCH 1/5] feat: regex data sanitization --- discovery/main.py | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 discovery/main.py diff --git a/discovery/main.py b/discovery/main.py new file mode 100644 index 0000000..03715b2 --- /dev/null +++ b/discovery/main.py @@ -0,0 +1,44 @@ +import csv +import re +from pathlib import Path + +ROOT = Path("../../archiv/Data/Politics") + +email_re = re.compile(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+") +phone_re = re.compile(r"(?:\+?\d{1,3}[-.\s]?)?(?:\(?\d{2,4}\)?[-.\s]?)?\d{3,4}[-.\s]?\d{3,4}") +ssn_re = re.compile(r"\b\d{3}[-\s]?\d{2}[-\s]?\d{4}\b") +uuid_re = re.compile(r"\b[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}\b") +pan_re = re.compile(r"\b[A-Z]{5}\d{4}[A-Z]\b") +iban_re = re.compile(r"\b[A-Z]{2}\d{2}[A-Z0-9]{1,30}\b") + +patterns = { + "EMAIL": email_re, + "PHONE": phone_re, + "SSN": ssn_re, + "UUID": uuid_re, + "PAN": pan_re, + "IBAN": iban_re, +} + +def find_entities(text): + found = [] + for label, pattern in patterns.items(): + for m in pattern.finditer(text): + found.append(f"{label}: {m.group(0)}") + return found + +with open("discovery_dataset.csv", "w", newline="", encoding="utf-8") as csvfile: + writer = csv.writer(csvfile) + writer.writerow(["filename", "text", "detected_entities"]) + + for path in ROOT.rglob("*"): + if not path.is_file(): + continue + try: + text = path.read_text(encoding="utf-8", errors="ignore") + except Exception: + continue + + entities = find_entities(text) + if text.strip(): # skip empty + writer.writerow([str(path), text[:5000], "; ".join(entities)]) # limit length if huge From 747e2df066c019d814395661c80e52b795bc0259 Mon Sep 17 00:00:00 2001 From: PlatypusPus <23h46.shovin@sjec.ac.in> Date: Fri, 7 Nov 2025 09:28:18 +0530 Subject: [PATCH 2/5] feat:Added CUDA --- cleaning.py | 97 ++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 88 insertions(+), 9 deletions(-) diff --git a/cleaning.py b/cleaning.py index f97eac6..a83d0da 100644 --- a/cleaning.py +++ b/cleaning.py @@ -1,6 +1,7 @@ """ Data Cleaning Module - PII Detection and Anonymization Handles GDPR-compliant data cleaning using Presidio for PII detection +GPU-accelerated for faster processing of large datasets """ import pandas as pd @@ -20,6 +21,36 @@ try: except ImportError: PRESIDIO_AVAILABLE = False print("Warning: Presidio not installed. Run: pip install presidio-analyzer presidio-anonymizer") +w ad +# GPU detection +try: + import torch + CUDA_AVAILABLE = torch.cuda.is_available() + if CUDA_AVAILABLE: + GPU_DEVICE = 0 # Use first GPU + GPU_NAME = torch.cuda.get_device_name(0) + GPU_MEMORY = torch.cuda.get_device_properties(0).total_memory / 1024**3 # GB + else: + GPU_DEVICE = -1 + GPU_NAME = None + GPU_MEMORY = 0 +except ImportError: + CUDA_AVAILABLE = False + GPU_DEVICE = -1 + GPU_NAME = None + GPU_MEMORY = 0 + +try: + import spacy + SPACY_AVAILABLE = True + # Check if spaCy can use GPU + if CUDA_AVAILABLE: + spacy.require_gpu() +except ImportError: + SPACY_AVAILABLE = False +except Exception: + # GPU not available for spaCy, will fall back to CPU + pass class CleaningConfig: @@ -97,18 +128,23 @@ class DataCleaner: ... ) """ - def __init__(self, df: pd.DataFrame, config: Optional[CleaningConfig] = None): + def __init__(self, df: pd.DataFrame, config: Optional[CleaningConfig] = None, use_gpu: bool = True): """ Initialize the data cleaner Args: df: Input DataFrame to clean config: Optional custom configuration + use_gpu: Whether to use GPU acceleration if available (default: True) """ self.df = df.copy() self.config = config or CleaningConfig() self.audit_log = [] self.cleaning_actions = {} + self.use_gpu = use_gpu and CUDA_AVAILABLE + + # Display GPU info + self._display_gpu_info() # Initialize Presidio engines if PRESIDIO_AVAILABLE: @@ -119,8 +155,29 @@ class DataCleaner: "Install with: pip install presidio-analyzer presidio-anonymizer" ) + def _display_gpu_info(self): + """Display GPU availability and configuration""" + print("\n" + "="*70) + print("🖥️ HARDWARE CONFIGURATION") + print("="*70) + + if CUDA_AVAILABLE and self.use_gpu: + print(f"✓ GPU ACCELERATION: ENABLED") + print(f" Device: {GPU_NAME}") + print(f" Memory: {GPU_MEMORY:.2f} GB") + print(f" CUDA Device ID: {GPU_DEVICE}") + elif CUDA_AVAILABLE and not self.use_gpu: + print(f"⚠️ GPU ACCELERATION: DISABLED (use_gpu=False)") + print(f" Available GPU: {GPU_NAME} ({GPU_MEMORY:.2f} GB)") + else: + print(f"⚠️ GPU ACCELERATION: NOT AVAILABLE") + print(f" Reason: {'PyTorch not installed' if not 'torch' in dir() else 'No CUDA device detected'}") + print(f" Install: pip install torch --index-url https://download.pytorch.org/whl/cu121") + + print("="*70 + "\n") + def _init_presidio(self): - """Initialize Presidio analyzer and anonymizer engines""" + """Initialize Presidio analyzer and anonymizer engines with GPU support""" # Create NLP engine configuration configuration = { "nlp_engine_name": "spacy", @@ -132,11 +189,23 @@ class DataCleaner: provider = NlpEngineProvider(nlp_configuration=configuration) nlp_engine = provider.create_engine() + # Enable GPU for spaCy if available + if self.use_gpu and SPACY_AVAILABLE: + try: + import spacy + # Move spaCy model to GPU + spacy.require_gpu() + print("✓ spaCy GPU acceleration enabled") + except Exception as e: + print(f"⚠️ Could not enable spaCy GPU: {e}") + print(" Falling back to CPU for NLP processing") + # Create analyzer with NLP engine self.analyzer = AnalyzerEngine(nlp_engine=nlp_engine) self.anonymizer = AnonymizerEngine() - print("✓ Presidio engines initialized successfully") + device_info = "GPU" if self.use_gpu else "CPU" + print(f"✓ Presidio engines initialized successfully ({device_info} mode)") except Exception as e: # Fallback to default configuration if spaCy model not available print(f"Warning: Could not load spaCy model, using default configuration: {e}") @@ -207,7 +276,7 @@ class DataCleaner: scan_all_cells: bool ) -> Dict[str, List[Dict]]: """ - Detect PII at column and cell level + Detect PII at column and cell level (GPU-accelerated when available) Returns: Dictionary mapping column names to list of detected entities @@ -226,7 +295,8 @@ class DataCleaner: text_columns = df.select_dtypes(include=['object']).columns.tolist() columns_to_scan = list(set(columns_to_scan + text_columns)) - print(f" Scanning {len(columns_to_scan)} columns: {columns_to_scan}") + device_info = f"GPU ({GPU_NAME})" if self.use_gpu else "CPU" + print(f" Scanning {len(columns_to_scan)} columns using {device_info}: {columns_to_scan}") for column in columns_to_scan: print(f" Analyzing '{column}'...", end=" ") @@ -526,7 +596,13 @@ class DataCleaner: 'original_columns': len(self.df.columns), 'cleaned_rows': len(cleaned_df), 'cleaned_columns': len(cleaned_df.columns), - 'presidio_version': 'enabled' if PRESIDIO_AVAILABLE else 'disabled' + 'presidio_version': 'enabled' if PRESIDIO_AVAILABLE else 'disabled', + 'gpu_acceleration': { + 'enabled': self.use_gpu, + 'cuda_available': CUDA_AVAILABLE, + 'device': GPU_NAME if self.use_gpu else 'CPU', + 'gpu_memory_gb': GPU_MEMORY if self.use_gpu else 0 + } }, 'summary': { 'columns_removed': [], @@ -706,19 +782,22 @@ def main(): import sys if len(sys.argv) < 2: - print("Usage: python cleaning.py ") + print("Usage: python cleaning.py [--no-gpu]") print("Example: python cleaning.py Datasets/loan_data.csv") + print("Options:") + print(" --no-gpu Disable GPU acceleration (use CPU only)") sys.exit(1) data_path = sys.argv[1] + use_gpu = '--no-gpu' not in sys.argv # Load data print(f"Loading data from {data_path}...") df = pd.read_csv(data_path) print(f"Loaded {len(df)} rows × {len(df.columns)} columns") - # Initialize cleaner - cleaner = DataCleaner(df) + # Initialize cleaner with GPU support + cleaner = DataCleaner(df, use_gpu=use_gpu) # Run cleaning (interactive mode) cleaned_df, audit_report = cleaner.clean( From f1af97f7c9bafacb353a6986e8d3a68d1c9d37e9 Mon Sep 17 00:00:00 2001 From: Chaithz Date: Fri, 7 Nov 2025 09:30:45 +0530 Subject: [PATCH 3/5] feat: csv generator --- discovery/main.py | 169 +++++++++++++++++++++++++++++++++++++++------- 1 file changed, 146 insertions(+), 23 deletions(-) diff --git a/discovery/main.py b/discovery/main.py index 03715b2..96fa8e7 100644 --- a/discovery/main.py +++ b/discovery/main.py @@ -1,44 +1,167 @@ import csv import re from pathlib import Path +from collections import Counter +from datetime import datetime -ROOT = Path("../../archiv/Data/Politics") +ROOT = Path("../Data/Politics") -email_re = re.compile(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+") -phone_re = re.compile(r"(?:\+?\d{1,3}[-.\s]?)?(?:\(?\d{2,4}\)?[-.\s]?)?\d{3,4}[-.\s]?\d{3,4}") -ssn_re = re.compile(r"\b\d{3}[-\s]?\d{2}[-\s]?\d{4}\b") -uuid_re = re.compile(r"\b[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}\b") -pan_re = re.compile(r"\b[A-Z]{5}\d{4}[A-Z]\b") -iban_re = re.compile(r"\b[A-Z]{2}\d{2}[A-Z0-9]{1,30}\b") +# Try to import spaCy, fall back to basic extraction if not available +try: + import spacy + nlp = spacy.load("en_core_web_sm") + USE_SPACY = True +except: + USE_SPACY = False +# Regex patterns for deterministic detection patterns = { - "EMAIL": email_re, - "PHONE": phone_re, - "SSN": ssn_re, - "UUID": uuid_re, - "PAN": pan_re, - "IBAN": iban_re, + "EMAIL": re.compile(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+"), + "PHONE": re.compile(r"(?:\+?\d{1,3}[-.\s]?)?(?:\(?\d{2,4}\)?[-.\s]?)?\d{3,4}[-.\s]?\d{3,4}"), + "UUID": re.compile(r"\b[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}\b"), + "IBAN": re.compile(r"\b[A-Z]{2}\d{2}[A-Z0-9]{1,30}\b"), + "DATE": re.compile(r"\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b|\b\d{4}[/-]\d{1,2}[/-]\d{1,2}\b|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* \d{1,2},? \d{4}"), + "URL": re.compile(r"https?://[^\s]+"), + "SSN": re.compile(r"\b\d{3}-\d{2}-\d{4}\b"), } def find_entities(text): - found = [] + """Extract entities using regex patterns.""" + found = {} for label, pattern in patterns.items(): - for m in pattern.finditer(text): - found.append(f"{label}: {m.group(0)}") + matches = pattern.findall(text) + if matches: + found[label] = list(set(matches))[:5] # Limit to 5 per type return found -with open("discovery_dataset.csv", "w", newline="", encoding="utf-8") as csvfile: - writer = csv.writer(csvfile) - writer.writerow(["filename", "text", "detected_entities"]) +def extract_with_spacy(text): + """Extract named entities using spaCy.""" + if not USE_SPACY: + return {}, {}, {} + + doc = nlp(text[:10000]) # Limit text length for performance + + persons = [] + orgs = [] + locations = [] + + for ent in doc.ents: + if ent.label_ == "PERSON": + persons.append(ent.text) + elif ent.label_ == "ORG": + orgs.append(ent.text) + elif ent.label_ in ["GPE", "LOC"]: + locations.append(ent.text) + + # Return most common entities + return ( + dict(Counter(persons).most_common(5)), + dict(Counter(orgs).most_common(5)), + dict(Counter(locations).most_common(5)) + ) +def extract_metadata(text, filename): + """Extract basic metadata from text.""" + metadata = { + "char_count": len(text), + "word_count": len(text.split()), + "line_count": text.count('\n') + 1, + "file_extension": Path(filename).suffix, + } + return metadata + +def detect_content_type(text): + """Heuristic content type detection.""" + text_lower = text.lower() + + # Check for common document types + if any(word in text_lower[:1000] for word in ['dear', 'sincerely', 'regards']): + return "letter" + elif any(word in text_lower[:500] for word in ['article', 'section', 'amendment']): + return "legal" + elif any(word in text_lower[:500] for word in ['press release', 'for immediate release']): + return "press_release" + elif re.search(r'^\s*#', text[:100], re.MULTILINE): + return "markdown" + elif ' 0: + print("\nTo install spaCy for better entity extraction:") + print(" pip install spacy") + print(" python -m spacy download en_core_web_sm") \ No newline at end of file From 8ead47d54daa05e167e7db58abc5412bb3fcd920 Mon Sep 17 00:00:00 2001 From: Chaithz Date: Fri, 7 Nov 2025 09:32:16 +0530 Subject: [PATCH 4/5] ignore data dir --- .gitignore | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index d6f6a5a..a637d67 100644 --- a/.gitignore +++ b/.gitignore @@ -56,4 +56,6 @@ reports/*.pdf # Data (keep demo dataset) *.csv -!Datasets/loan_data.csv \ No newline at end of file +!Datasets/loan_data.csv + +Data \ No newline at end of file From d4c84c30fcb4038f7a54d3525af8c93782370c31 Mon Sep 17 00:00:00 2001 From: PlatypusPus <23h46.shovin@sjec.ac.in> Date: Fri, 7 Nov 2025 09:43:42 +0530 Subject: [PATCH 5/5] Ref:Fixed Things --- Datasets/loan_data_cleaned_audit.json | 142 ++++++++++++++++++++++++++ cleaning.py | 1 - 2 files changed, 142 insertions(+), 1 deletion(-) create mode 100644 Datasets/loan_data_cleaned_audit.json diff --git a/Datasets/loan_data_cleaned_audit.json b/Datasets/loan_data_cleaned_audit.json new file mode 100644 index 0000000..300d98b --- /dev/null +++ b/Datasets/loan_data_cleaned_audit.json @@ -0,0 +1,142 @@ +{ + "metadata": { + "timestamp": "2025-11-07T08:59:26.645555", + "original_rows": 45000, + "original_columns": 14, + "cleaned_rows": 45000, + "cleaned_columns": 13, + "presidio_version": "enabled", + "gpu_acceleration": { + "enabled": true, + "cuda_available": true, + "device": "NVIDIA GeForce RTX 4050 Laptop GPU", + "gpu_memory_gb": 5.99658203125 + } + }, + "summary": { + "columns_removed": [ + "person_education" + ], + "columns_anonymized": [ + "loan_intent", + "person_home_ownership" + ], + "total_cells_affected": 49906 + }, + "details": { + "loan_intent": { + "action": "ANONYMIZED", + "strategies_applied": [ + "HASH" + ], + "reason": "Contains ORGANIZATION entities. Applied hash anonymization to protect privacy.", + "entity_types_found": [ + "ORGANIZATION" + ], + "num_affected_rows": 23512, + "percentage_affected": "52.2%", + "examples": [ + { + "before": "MEDICAL", + "after": "a978e21c3754862e57020380a3e9ea7ed66e16dfa3db6fb28b" + }, + { + "before": "MEDICAL", + "after": "a978e21c3754862e57020380a3e9ea7ed66e16dfa3db6fb28b" + }, + { + "before": "MEDICAL", + "after": "a978e21c3754862e57020380a3e9ea7ed66e16dfa3db6fb28b" + } + ], + "presidio_metrics": { + "avg_confidence": 0.85, + "detections": [ + { + "entity_type": "ORGANIZATION", + "count": 49, + "avg_confidence": 0.85, + "max_confidence": 0.85, + "min_confidence": 0.85 + } + ] + }, + "gdpr_compliance": [] + }, + "person_home_ownership": { + "action": "ANONYMIZED", + "strategies_applied": [ + "MASK" + ], + "reason": "Contains ORGANIZATION, LOCATION entities. Applied mask anonymization to protect privacy.", + "entity_types_found": [ + "ORGANIZATION", + "LOCATION" + ], + "num_affected_rows": 26394, + "percentage_affected": "58.7%", + "examples": [ + { + "before": "RENT", + "after": "****" + }, + { + "before": "OWN", + "after": "***" + }, + { + "before": "RENT", + "after": "****" + } + ], + "presidio_metrics": { + "avg_confidence": 0.85, + "detections": [ + { + "entity_type": "ORGANIZATION", + "count": 24, + "avg_confidence": 0.85, + "max_confidence": 0.85, + "min_confidence": 0.85 + }, + { + "entity_type": "LOCATION", + "count": 49, + "avg_confidence": 0.85, + "max_confidence": 0.85, + "min_confidence": 0.85 + } + ] + }, + "gdpr_compliance": [ + "Art. 4(1) - Personal data (location)" + ] + }, + "person_education": { + "action": "REMOVED", + "reason": "Contains HIGH risk PII requiring removal", + "entity_types_found": [ + "ORGANIZATION" + ], + "risk_level": "HIGH", + "presidio_metrics": { + "detections": [ + { + "entity_type": "ORGANIZATION", + "count": 4, + "avg_confidence": 0.85, + "max_confidence": 0.85, + "min_confidence": 0.85 + } + ] + }, + "gdpr_compliance": [] + } + }, + "compliance": { + "gdpr_articles_applied": [ + "Art. 4(1) - Personal data (location)" + ], + "risk_mitigation": {} + } +} \ No newline at end of file diff --git a/cleaning.py b/cleaning.py index a83d0da..127701a 100644 --- a/cleaning.py +++ b/cleaning.py @@ -21,7 +21,6 @@ try: except ImportError: PRESIDIO_AVAILABLE = False print("Warning: Presidio not installed. Run: pip install presidio-analyzer presidio-anonymizer") -w ad # GPU detection try: import torch