From 7e3938d555e32e7527c28869a458dc88febd6b9b Mon Sep 17 00:00:00 2001
From: nearlynithin <nearlynithin@gmail.com>
Date: Fri, 7 Nov 2025 04:32:02 +0530
Subject: [PATCH 1/5] feat: regex data sanitization

---
 discovery/main.py | 44 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 44 insertions(+)
 create mode 100644 discovery/main.py

diff --git a/discovery/main.py b/discovery/main.py
new file mode 100644
index 0000000..03715b2
--- /dev/null
+++ b/discovery/main.py
@@ -0,0 +1,44 @@
+import csv
+import re
+from pathlib import Path
+
+ROOT = Path("../../archiv/Data/Politics")
+
+email_re = re.compile(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+")
+phone_re = re.compile(r"(?:\+?\d{1,3}[-.\s]?)?(?:\(?\d{2,4}\)?[-.\s]?)?\d{3,4}[-.\s]?\d{3,4}")
+ssn_re = re.compile(r"\b\d{3}[-\s]?\d{2}[-\s]?\d{4}\b")
+uuid_re = re.compile(r"\b[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}\b")
+pan_re = re.compile(r"\b[A-Z]{5}\d{4}[A-Z]\b")
+iban_re = re.compile(r"\b[A-Z]{2}\d{2}[A-Z0-9]{1,30}\b")
+
+patterns = {
+    "EMAIL": email_re,
+    "PHONE": phone_re,
+    "SSN": ssn_re,
+    "UUID": uuid_re,
+    "PAN": pan_re,
+    "IBAN": iban_re,
+}
+
+def find_entities(text):
+    found = []
+    for label, pattern in patterns.items():
+        for m in pattern.finditer(text):
+            found.append(f"{label}: {m.group(0)}")
+    return found
+
+with open("discovery_dataset.csv", "w", newline="", encoding="utf-8") as csvfile:
+    writer = csv.writer(csvfile)
+    writer.writerow(["filename", "text", "detected_entities"])
+
+    for path in ROOT.rglob("*"):
+        if not path.is_file():
+            continue
+        try:
+            text = path.read_text(encoding="utf-8", errors="ignore")
+        except Exception:
+            continue
+
+        entities = find_entities(text)
+        if text.strip():  # skip empty
+            writer.writerow([str(path), text[:5000], "; ".join(entities)])  # limit length if huge

From 747e2df066c019d814395661c80e52b795bc0259 Mon Sep 17 00:00:00 2001
From: PlatypusPus <23h46.shovin@sjec.ac.in>
Date: Fri, 7 Nov 2025 09:28:18 +0530
Subject: [PATCH 2/5] feat:Added CUDA

---
 cleaning.py | 97 ++++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 88 insertions(+), 9 deletions(-)

diff --git a/cleaning.py b/cleaning.py
index f97eac6..a83d0da 100644
--- a/cleaning.py
+++ b/cleaning.py
@@ -1,6 +1,7 @@
 """
 Data Cleaning Module - PII Detection and Anonymization
 Handles GDPR-compliant data cleaning using Presidio for PII detection
+GPU-accelerated for faster processing of large datasets
 """
 
 import pandas as pd
@@ -20,6 +21,36 @@ try:
 except ImportError:
     PRESIDIO_AVAILABLE = False
     print("Warning: Presidio not installed. Run: pip install presidio-analyzer presidio-anonymizer")
+w    ad 
+# GPU detection
+try:
+    import torch
+    CUDA_AVAILABLE = torch.cuda.is_available()
+except ImportError:
+    # No PyTorch: treat the machine as having no CUDA device
+    CUDA_AVAILABLE = False
+
+if CUDA_AVAILABLE:
+    GPU_DEVICE = 0  # Use first GPU
+    GPU_NAME = torch.cuda.get_device_name(0)
+    GPU_MEMORY = torch.cuda.get_device_properties(0).total_memory / 1024**3  # GB
+else:
+    # CPU-only placeholders read by DataCleaner and the audit report
+    GPU_DEVICE = -1
+    GPU_NAME = None
+    GPU_MEMORY = 0
+
+try:
+    import spacy
+    SPACY_AVAILABLE = True
+except Exception:
+    # Missing or broken spaCy: define the flag either way (avoids NameError later)
+    SPACY_AVAILABLE = False
+if SPACY_AVAILABLE and CUDA_AVAILABLE:
+    try:
+        spacy.require_gpu()
+    except Exception:
+        pass  # GPU not available for spaCy, will fall back to CPU
 
 
 class CleaningConfig:
@@ -97,18 +128,23 @@ class DataCleaner:
         ... )
     """
     
-    def __init__(self, df: pd.DataFrame, config: Optional[CleaningConfig] = None):
+    def __init__(self, df: pd.DataFrame, config: Optional[CleaningConfig] = None, use_gpu: bool = True):
         """
         Initialize the data cleaner
         
         Args:
             df: Input DataFrame to clean
             config: Optional custom configuration
+            use_gpu: Whether to use GPU acceleration if available (default: True)
         """
         self.df = df.copy()
         self.config = config or CleaningConfig()
         self.audit_log = []
         self.cleaning_actions = {}
+        self.use_gpu = use_gpu and CUDA_AVAILABLE
+        
+        # Display GPU info
+        self._display_gpu_info()
         
         # Initialize Presidio engines
         if PRESIDIO_AVAILABLE:
@@ -119,8 +155,29 @@ class DataCleaner:
                 "Install with: pip install presidio-analyzer presidio-anonymizer"
             )
     
+    def _display_gpu_info(self):
+        """Print a banner reporting GPU availability (module-level CUDA_AVAILABLE / GPU_*) and self.use_gpu."""
+        print("\n" + "="*70)
+        print("🖥️  HARDWARE CONFIGURATION")
+        print("="*70)
+        if CUDA_AVAILABLE and self.use_gpu:
+            print("✓ GPU ACCELERATION: ENABLED")
+            print(f"  Device: {GPU_NAME}")
+            print(f"  Memory: {GPU_MEMORY:.2f} GB")
+            print(f"  CUDA Device ID: {GPU_DEVICE}")
+        elif CUDA_AVAILABLE and not self.use_gpu:
+            print("⚠️  GPU ACCELERATION: DISABLED (use_gpu=False)")
+            print(f"  Available GPU: {GPU_NAME} ({GPU_MEMORY:.2f} GB)")
+        else:
+            # dir() inside a method lists only local names, so check the module namespace
+            torch_missing = 'torch' not in globals()
+            print("⚠️  GPU ACCELERATION: NOT AVAILABLE")
+            print(f"  Reason: {'PyTorch not installed' if torch_missing else 'No CUDA device detected'}")
+            print("  Install: pip install torch --index-url https://download.pytorch.org/whl/cu121")
+        print("="*70 + "\n")
+    
     def _init_presidio(self):
-        """Initialize Presidio analyzer and anonymizer engines"""
+        """Initialize Presidio analyzer and anonymizer engines with GPU support"""
         # Create NLP engine configuration
         configuration = {
             "nlp_engine_name": "spacy",
@@ -132,11 +189,23 @@ class DataCleaner:
             provider = NlpEngineProvider(nlp_configuration=configuration)
             nlp_engine = provider.create_engine()
             
+            # Enable GPU for spaCy if available
+            if self.use_gpu and SPACY_AVAILABLE:
+                try:
+                    import spacy
+                    # Move spaCy model to GPU
+                    spacy.require_gpu()
+                    print("✓ spaCy GPU acceleration enabled")
+                except Exception as e:
+                    print(f"⚠️  Could not enable spaCy GPU: {e}")
+                    print("  Falling back to CPU for NLP processing")
+            
             # Create analyzer with NLP engine
             self.analyzer = AnalyzerEngine(nlp_engine=nlp_engine)
             self.anonymizer = AnonymizerEngine()
             
-            print("✓ Presidio engines initialized successfully")
+            device_info = "GPU" if self.use_gpu else "CPU"
+            print(f"✓ Presidio engines initialized successfully ({device_info} mode)")
         except Exception as e:
             # Fallback to default configuration if spaCy model not available
             print(f"Warning: Could not load spaCy model, using default configuration: {e}")
@@ -207,7 +276,7 @@ class DataCleaner:
         scan_all_cells: bool
     ) -> Dict[str, List[Dict]]:
         """
-        Detect PII at column and cell level
+        Detect PII at column and cell level (GPU-accelerated when available)
         
         Returns:
             Dictionary mapping column names to list of detected entities
@@ -226,7 +295,8 @@ class DataCleaner:
             text_columns = df.select_dtypes(include=['object']).columns.tolist()
             columns_to_scan = list(set(columns_to_scan + text_columns))
         
-        print(f"  Scanning {len(columns_to_scan)} columns: {columns_to_scan}")
+        device_info = f"GPU ({GPU_NAME})" if self.use_gpu else "CPU"
+        print(f"  Scanning {len(columns_to_scan)} columns using {device_info}: {columns_to_scan}")
         
         for column in columns_to_scan:
             print(f"  Analyzing '{column}'...", end=" ")
@@ -526,7 +596,13 @@ class DataCleaner:
                 'original_columns': len(self.df.columns),
                 'cleaned_rows': len(cleaned_df),
                 'cleaned_columns': len(cleaned_df.columns),
-                'presidio_version': 'enabled' if PRESIDIO_AVAILABLE else 'disabled'
+                'presidio_version': 'enabled' if PRESIDIO_AVAILABLE else 'disabled',
+                'gpu_acceleration': {
+                    'enabled': self.use_gpu,
+                    'cuda_available': CUDA_AVAILABLE,
+                    'device': GPU_NAME if self.use_gpu else 'CPU',
+                    'gpu_memory_gb': GPU_MEMORY if self.use_gpu else 0
+                }
             },
             'summary': {
                 'columns_removed': [],
@@ -706,19 +782,22 @@ def main():
     import sys
     
     if len(sys.argv) < 2:
-        print("Usage: python cleaning.py <data_file.csv>")
+        print("Usage: python cleaning.py <data_file.csv> [--no-gpu]")
         print("Example: python cleaning.py Datasets/loan_data.csv")
+        print("Options:")
+        print("  --no-gpu    Disable GPU acceleration (use CPU only)")
         sys.exit(1)
     
     data_path = sys.argv[1]
+    use_gpu = '--no-gpu' not in sys.argv
     
     # Load data
     print(f"Loading data from {data_path}...")
     df = pd.read_csv(data_path)
     print(f"Loaded {len(df)} rows × {len(df.columns)} columns")
     
-    # Initialize cleaner
-    cleaner = DataCleaner(df)
+    # Initialize cleaner with GPU support
+    cleaner = DataCleaner(df, use_gpu=use_gpu)
     
     # Run cleaning (interactive mode)
     cleaned_df, audit_report = cleaner.clean(

From f1af97f7c9bafacb353a6986e8d3a68d1c9d37e9 Mon Sep 17 00:00:00 2001
From: Chaithz <chaithanyakulal22@gmail.com>
Date: Fri, 7 Nov 2025 09:30:45 +0530
Subject: [PATCH 3/5] feat: csv generator

---
 discovery/main.py | 169 +++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 146 insertions(+), 23 deletions(-)

diff --git a/discovery/main.py b/discovery/main.py
index 03715b2..96fa8e7 100644
--- a/discovery/main.py
+++ b/discovery/main.py
@@ -1,44 +1,167 @@
 import csv
 import re
 from pathlib import Path
+from collections import Counter
+from datetime import datetime
 
-ROOT = Path("../../archiv/Data/Politics")
+ROOT = Path("../Data/Politics")
 
-email_re = re.compile(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+")
-phone_re = re.compile(r"(?:\+?\d{1,3}[-.\s]?)?(?:\(?\d{2,4}\)?[-.\s]?)?\d{3,4}[-.\s]?\d{3,4}")
-ssn_re = re.compile(r"\b\d{3}[-\s]?\d{2}[-\s]?\d{4}\b")
-uuid_re = re.compile(r"\b[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}\b")
-pan_re = re.compile(r"\b[A-Z]{5}\d{4}[A-Z]\b")
-iban_re = re.compile(r"\b[A-Z]{2}\d{2}[A-Z0-9]{1,30}\b")
+# Try to import spaCy, fall back to basic extraction if not available
+try:
+    import spacy
+    nlp = spacy.load("en_core_web_sm")
+    USE_SPACY = True
+except Exception:  # ImportError (no spaCy) or OSError (model not downloaded)
+    USE_SPACY = False
 
+# Regex patterns for deterministic detection
 patterns = {
-    "EMAIL": email_re,
-    "PHONE": phone_re,
-    "SSN": ssn_re,
-    "UUID": uuid_re,
-    "PAN": pan_re,
-    "IBAN": iban_re,
+    "EMAIL": re.compile(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+"),
+    "PHONE": re.compile(r"(?:\+?\d{1,3}[-.\s]?)?(?:\(?\d{2,4}\)?[-.\s]?)?\d{3,4}[-.\s]?\d{3,4}"),
+    "UUID": re.compile(r"\b[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}\b"),
+    "IBAN": re.compile(r"\b[A-Z]{2}\d{2}[A-Z0-9]{1,30}\b"),
+    "DATE": re.compile(r"\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b|\b\d{4}[/-]\d{1,2}[/-]\d{1,2}\b|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* \d{1,2},? \d{4}"),
+    "URL": re.compile(r"https?://[^\s]+"),
+    "SSN": re.compile(r"\b\d{3}-\d{2}-\d{4}\b"),
 }
 
 def find_entities(text):
-    found = []
+    """Return {label: up to 5 distinct matches, in order of appearance} for each pattern that matched."""
+    found = {}
     for label, pattern in patterns.items():
-        for m in pattern.finditer(text):
-            found.append(f"{label}: {m.group(0)}")
+        matches = pattern.findall(text)
+        if matches:
+            found[label] = list(dict.fromkeys(matches))[:5]  # dedupe, keep text order (set() order varies per run)
     return found
 
-with open("discovery_dataset.csv", "w", newline="", encoding="utf-8") as csvfile:
-    writer = csv.writer(csvfile)
-    writer.writerow(["filename", "text", "detected_entities"])
+def extract_with_spacy(text):
+    """Count PERSON, ORG and GPE/LOC entities found by spaCy.
+
+    Only the first 10000 characters of text are analysed. Returns three
+    dicts (persons, organizations, locations) mapping entity text to its
+    number of occurrences, each limited to the 5 most frequent entries.
+    All three are empty when spaCy could not be loaded.
+    """
+    if not USE_SPACY:
+        return {}, {}, {}
+
+    # Both location-like labels feed the same counter.
+    person_counts, org_counts, location_counts = Counter(), Counter(), Counter()
+    counter_for_label = {
+        "PERSON": person_counts,
+        "ORG": org_counts,
+        "GPE": location_counts,
+        "LOC": location_counts,
+    }
+    for entity in nlp(text[:10000]).ents:
+        bucket = counter_for_label.get(entity.label_)
+        if bucket is not None:
+            bucket[entity.text] += 1
+
+    return tuple(dict(c.most_common(5)) for c in (person_counts, org_counts, location_counts))
 
+def extract_metadata(text, filename):
+    """Return size statistics for text and the extension of filename.
+
+    Keys: char_count, word_count (whitespace-separated tokens),
+    line_count (newline characters + 1) and file_extension.
+    """
+    words = text.split()
+    extension = Path(filename).suffix
+    return dict(char_count=len(text), word_count=len(words), line_count=text.count('\n') + 1, file_extension=extension)
+
+def detect_content_type(text):
+    """Classify text by keyword heuristics; the first matching rule wins.
+
+    Returns "letter", "legal", "press_release", "markdown", "html" or "unknown".
+    """
+    lowered = text.lower()
+    keyword_rules = (  # (label, keywords, number of leading characters searched)
+        ("letter", ('dear', 'sincerely', 'regards'), 1000),
+        ("legal", ('article', 'section', 'amendment'), 500),
+        ("press_release", ('press release', 'for immediate release'), 500),
+    )
+    for label, keywords, window in keyword_rules:
+        if any(keyword in lowered[:window] for keyword in keywords):
+            return label
+    if re.search(r'^\s*#', text[:100], re.MULTILINE):
+        return "markdown"
+    return "html" if '<html' in lowered[:200] else "unknown"
+
+# CSV columns in output order; every key written into a row below appears here
+fieldnames = [
+    "filename", "file_extension", "char_count", "word_count", "line_count",
+    "content_type", "text_preview",
+    "EMAIL", "PHONE", "UUID", "IBAN", "DATE", "URL", "SSN",
+    "persons", "organizations", "locations"
+]
+
+print("Processing files...")
+with open("discovery_dataset.csv", "w", newline="", encoding="utf-8") as csvfile:
+    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
+    writer.writeheader()
+    
+    file_count = 0
     for path in ROOT.rglob("*"):
         if not path.is_file():
             continue
+        
+        # Skip files whose extension marks a binary format (executables/libraries, images, PDF)
+        if path.suffix.lower() in ['.exe', '.dll', '.so', '.dylib', '.bin', '.jpg', '.png', '.gif', '.pdf']:
+            continue
+        
         try:
             text = path.read_text(encoding="utf-8", errors="ignore")
-        except Exception:
+        except Exception as e:
+            print(f"  Error reading {path.name}: {e}")
             continue
-
+        
+        if not text.strip():
+            continue
+        
+        file_count += 1
+        if file_count % 10 == 0:
+            print(f"Processed {file_count} files...")
+        
+        # Store the path relative to ROOT's parent directory
+        row = {"filename": str(path.relative_to(ROOT.parent))}
+        
+        # Character/word/line counts and the file extension
+        metadata = extract_metadata(text, path.name)
+        row.update(metadata)
+        
+        # Heuristic content type, plus a 500-character preview with line breaks replaced by spaces
+        row["content_type"] = detect_content_type(text)
+        row["text_preview"] = text[:500].replace('\n', ' ').replace('\r', ' ')
+        
+        # Regex matches: one column per pattern label, values joined with "; "
         entities = find_entities(text)
-        if text.strip():  # skip empty
-            writer.writerow([str(path), text[:5000], "; ".join(entities)])  # limit length if huge
+        for key, values in entities.items():
+            row[key] = "; ".join(values) if values else ""
+        
+        # Pattern columns without any match are written as empty strings
+        for pattern_key in ["EMAIL", "PHONE", "UUID", "IBAN", "DATE", "URL", "SSN"]:
+            if pattern_key not in row:
+                row[pattern_key] = ""
+        
+        # Named entities formatted as "text(count)"; empty columns when spaCy is unavailable
+        if USE_SPACY:
+            persons, orgs, locs = extract_with_spacy(text)
+            row["persons"] = "; ".join([f"{k}({v})" for k, v in persons.items()])
+            row["organizations"] = "; ".join([f"{k}({v})" for k, v in orgs.items()])
+            row["locations"] = "; ".join([f"{k}({v})" for k, v in locs.items()])
+        else:
+            row["persons"] = ""
+            row["organizations"] = ""
+            row["locations"] = ""
+        
+        writer.writerow(row)
+
+print(f"\nComplete! Processed {file_count} files.")
+print("Output: discovery_dataset.csv")
+
+# Offer the spaCy install hint only when spaCy could not be loaded for this run
+if file_count > 0 and not USE_SPACY:
+    print("\nTo install spaCy for better entity extraction:")
+    print("  pip install spacy")
+    print("  python -m spacy download en_core_web_sm")
\ No newline at end of file

From 8ead47d54daa05e167e7db58abc5412bb3fcd920 Mon Sep 17 00:00:00 2001
From: Chaithz <chaithanyakulal22@gmail.com>
Date: Fri, 7 Nov 2025 09:32:16 +0530
Subject: [PATCH 4/5] ignore data dir

---
 .gitignore | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index d6f6a5a..a637d67 100644
--- a/.gitignore
+++ b/.gitignore
@@ -56,4 +56,6 @@ reports/*.pdf
 
 # Data (keep demo dataset)
 *.csv
-!Datasets/loan_data.csv
\ No newline at end of file
+!Datasets/loan_data.csv
+
+Data
\ No newline at end of file

From d4c84c30fcb4038f7a54d3525af8c93782370c31 Mon Sep 17 00:00:00 2001
From: PlatypusPus <23h46.shovin@sjec.ac.in>
Date: Fri, 7 Nov 2025 09:43:42 +0530
Subject: [PATCH 5/5] Ref:Fixed Things

---
 Datasets/loan_data_cleaned_audit.json | 142 ++++++++++++++++++++++++++
 cleaning.py                           |   1 -
 2 files changed, 142 insertions(+), 1 deletion(-)
 create mode 100644 Datasets/loan_data_cleaned_audit.json

diff --git a/Datasets/loan_data_cleaned_audit.json b/Datasets/loan_data_cleaned_audit.json
new file mode 100644
index 0000000..300d98b
--- /dev/null
+++ b/Datasets/loan_data_cleaned_audit.json
@@ -0,0 +1,142 @@
+{
+  "metadata": {
+    "timestamp": "2025-11-07T08:59:26.645555",
+    "original_rows": 45000,
+    "original_columns": 14,
+    "cleaned_rows": 45000,
+    "cleaned_columns": 13,
+    "presidio_version": "enabled",
+    "gpu_acceleration": {
+      "enabled": true,
+      "cuda_available": true,
+      "device": "NVIDIA GeForce RTX 4050 Laptop GPU",
+      "gpu_memory_gb": 5.99658203125
+    }
+  },
+  "summary": {
+    "columns_removed": [
+      "person_education"
+    ],
+    "columns_anonymized": [
+      "loan_intent",
+      "person_home_ownership"
+    ],
+    "total_cells_affected": 49906
+  },
+  "details": {
+    "loan_intent": {
+      "action": "ANONYMIZED",
+      "strategies_applied": [
+        "HASH"
+      ],
+      "reason": "Contains ORGANIZATION entities. Applied hash anonymization to protect privacy.",
+      "entity_types_found": [
+        "ORGANIZATION"
+      ],
+      "num_affected_rows": 23512,
+      "percentage_affected": "52.2%",
+      "examples": [
+        {
+          "before": "MEDICAL",
+          "after": "a978e21c3754862e57020380a3e9ea7ed66e16dfa3db6fb28b"
+        },
+        {
+          "before": "MEDICAL",
+          "after": "a978e21c3754862e57020380a3e9ea7ed66e16dfa3db6fb28b"
+        },
+        {
+          "before": "MEDICAL",
+          "after": "a978e21c3754862e57020380a3e9ea7ed66e16dfa3db6fb28b"
+        }
+      ],
+      "presidio_metrics": {
+        "avg_confidence": 0.85,
+        "detections": [
+          {
+            "entity_type": "ORGANIZATION",
+            "count": 49,
+            "avg_confidence": 0.85,
+            "max_confidence": 0.85,
+            "min_confidence": 0.85
+          }
+        ]
+      },
+      "gdpr_compliance": []
+    },
+    "person_home_ownership": {
+      "action": "ANONYMIZED",
+      "strategies_applied": [
+        "MASK"
+      ],
+      "reason": "Contains ORGANIZATION, LOCATION entities. Applied mask anonymization to protect privacy.",
+      "entity_types_found": [
+        "ORGANIZATION",
+        "LOCATION"
+      ],
+      "num_affected_rows": 26394,
+      "percentage_affected": "58.7%",
+      "examples": [
+        {
+          "before": "RENT",
+          "after": "****"
+        },
+        {
+          "before": "OWN",
+          "after": "***"
+        },
+        {
+          "before": "RENT",
+          "after": "****"
+        }
+      ],
+      "presidio_metrics": {
+        "avg_confidence": 0.85,
+        "detections": [
+          {
+            "entity_type": "ORGANIZATION",
+            "count": 24,
+            "avg_confidence": 0.85,
+            "max_confidence": 0.85,
+            "min_confidence": 0.85
+          },
+          {
+            "entity_type": "LOCATION",
+            "count": 49,
+            "avg_confidence": 0.85,
+            "max_confidence": 0.85,
+            "min_confidence": 0.85
+          }
+        ]
+      },
+      "gdpr_compliance": [
+        "Art. 4(1) - Personal data (location)"
+      ]
+    },
+    "person_education": {
+      "action": "REMOVED",
+      "reason": "Contains HIGH risk PII requiring removal",
+      "entity_types_found": [
+        "ORGANIZATION"
+      ],
+      "risk_level": "HIGH",
+      "presidio_metrics": {
+        "detections": [
+          {
+            "entity_type": "ORGANIZATION",
+            "count": 4,
+            "avg_confidence": 0.85,
+            "max_confidence": 0.85,
+            "min_confidence": 0.85
+          }
+        ]
+      },
+      "gdpr_compliance": []
+    }
+  },
+  "compliance": {
+    "gdpr_articles_applied": [
+      "Art. 4(1) - Personal data (location)"
+    ],
+    "risk_mitigation": {}
+  }
+}
\ No newline at end of file
diff --git a/cleaning.py b/cleaning.py
index a83d0da..127701a 100644
--- a/cleaning.py
+++ b/cleaning.py
@@ -21,7 +21,6 @@ try:
 except ImportError:
     PRESIDIO_AVAILABLE = False
     print("Warning: Presidio not installed. Run: pip install presidio-analyzer presidio-anonymizer")
-w    ad 
 # GPU detection
 try:
     import torch