From 927b91951820ce3793baad174f0898ca01705191 Mon Sep 17 00:00:00 2001
From: dionjoshualobo <23h13.joshua@sjec.ac.in>
Date: Fri, 7 Nov 2025 08:35:37 +0530
Subject: [PATCH 1/2] Updated gitignore and requirements

---
 .gitignore       | 1 +
 requirements.txt | 8 +++++++-
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index d6f6a5a..e13f817 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,6 +5,7 @@ __pycache__/
 *.so
 .Python
 build/
+output
 develop-eggs/
 dist/
 downloads/
diff --git a/requirements.txt b/requirements.txt
index d7778da..e69a34b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,4 +3,10 @@ pandas>=2.0.0
 numpy>=1.24.0
 scikit-learn>=1.3.0
 python-dateutil>=2.8.0
-setuptools>=65.0.0
\ No newline at end of file
+setuptools>=65.0.0
+
+# PII Detection and Anonymization (for cleaning.py)
+presidio-analyzer>=2.2.0
+presidio-anonymizer>=2.2.0
+spacy>=3.7.0
+# Download spaCy model with: python -m spacy download en_core_web_sm
\ No newline at end of file

From 747e2df066c019d814395661c80e52b795bc0259 Mon Sep 17 00:00:00 2001
From: PlatypusPus <23h46.shovin@sjec.ac.in>
Date: Fri, 7 Nov 2025 09:28:18 +0530
Subject: [PATCH 2/2] feat:Added CUDA

---
 cleaning.py | 97 ++++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 88 insertions(+), 9 deletions(-)

diff --git a/cleaning.py b/cleaning.py
index f97eac6..a83d0da 100644
--- a/cleaning.py
+++ b/cleaning.py
@@ -1,6 +1,7 @@
 """
 Data Cleaning Module - PII Detection and Anonymization
 Handles GDPR-compliant data cleaning using Presidio for PII detection
+GPU-accelerated for faster processing of large datasets
 """
 
 import pandas as pd
@@ -20,6 +21,36 @@ try:
 except ImportError:
     PRESIDIO_AVAILABLE = False
     print("Warning: Presidio not installed. Run: pip install presidio-analyzer presidio-anonymizer")
+
+# GPU detection
+try:
+    import torch
+    CUDA_AVAILABLE = torch.cuda.is_available()
+    if CUDA_AVAILABLE:
+        GPU_DEVICE = 0  # Use first GPU
+        GPU_NAME = torch.cuda.get_device_name(0)
+        GPU_MEMORY = torch.cuda.get_device_properties(0).total_memory / 1024**3  # GB
+    else:
+        GPU_DEVICE = -1
+        GPU_NAME = None
+        GPU_MEMORY = 0
+except ImportError:
+    CUDA_AVAILABLE = False
+    GPU_DEVICE = -1
+    GPU_NAME = None
+    GPU_MEMORY = 0
+
+try:
+    import spacy
+    SPACY_AVAILABLE = True
+    # Check if spaCy can use GPU
+    if CUDA_AVAILABLE:
+        spacy.require_gpu()
+except ImportError:
+    SPACY_AVAILABLE = False
+except Exception:
+    # GPU not available for spaCy, will fall back to CPU
+    pass
 
 
 class CleaningConfig:
@@ -97,18 +128,23 @@ class DataCleaner:
         ... )
     """
     
-    def __init__(self, df: pd.DataFrame, config: Optional[CleaningConfig] = None):
+    def __init__(self, df: pd.DataFrame, config: Optional[CleaningConfig] = None, use_gpu: bool = True):
         """
         Initialize the data cleaner
         
         Args:
             df: Input DataFrame to clean
             config: Optional custom configuration
+            use_gpu: Whether to use GPU acceleration if available (default: True)
         """
         self.df = df.copy()
         self.config = config or CleaningConfig()
         self.audit_log = []
         self.cleaning_actions = {}
+        self.use_gpu = use_gpu and CUDA_AVAILABLE
+        
+        # Display GPU info
+        self._display_gpu_info()
         
         # Initialize Presidio engines
         if PRESIDIO_AVAILABLE:
@@ -119,8 +155,29 @@ class DataCleaner:
                 "Install with: pip install presidio-analyzer presidio-anonymizer"
             )
     
+    def _display_gpu_info(self):
+        """Print the hardware banner: GPU in use, GPU present but disabled via use_gpu, or no usable GPU."""
+        print("\n" + "="*70)
+        print("🖥️  HARDWARE CONFIGURATION")
+        print("="*70)
+        if CUDA_AVAILABLE and self.use_gpu:
+            print(f"✓ GPU ACCELERATION: ENABLED")
+            print(f"  Device: {GPU_NAME}")
+            print(f"  Memory: {GPU_MEMORY:.2f} GB")
+            print(f"  CUDA Device ID: {GPU_DEVICE}")
+        elif CUDA_AVAILABLE and not self.use_gpu:
+            print(f"⚠️  GPU ACCELERATION: DISABLED (use_gpu=False)")
+            print(f"  Available GPU: {GPU_NAME} ({GPU_MEMORY:.2f} GB)")
+        else:
+            print(f"⚠️  GPU ACCELERATION: NOT AVAILABLE")
+            # Bare dir() lists only this method's locals, so test the module globals for the optional torch import.
+            print(f"  Reason: {'PyTorch not installed' if 'torch' not in globals() else 'No CUDA device detected'}")
+            print(f"  Install: pip install torch --index-url https://download.pytorch.org/whl/cu121")
+        
+        print("="*70 + "\n")
+    
     def _init_presidio(self):
-        """Initialize Presidio analyzer and anonymizer engines"""
+        """Initialize Presidio analyzer and anonymizer engines with GPU support"""
         # Create NLP engine configuration
         configuration = {
             "nlp_engine_name": "spacy",
@@ -132,11 +189,23 @@ class DataCleaner:
             provider = NlpEngineProvider(nlp_configuration=configuration)
             nlp_engine = provider.create_engine()
             
+            # Enable GPU for spaCy if available
+            if self.use_gpu and SPACY_AVAILABLE:
+                try:
+                    import spacy
+                    spacy.require_gpu()
+                    print("✓ spaCy GPU acceleration enabled")
+                except Exception as e:
+                    # Record the fallback so later "GPU"/"CPU" messages and the audit report stay truthful.
+                    self.use_gpu = False
+                    print(f"⚠️  Could not enable spaCy GPU: {e}\n  Falling back to CPU for NLP processing")
+            
             # Create analyzer with NLP engine
             self.analyzer = AnalyzerEngine(nlp_engine=nlp_engine)
             self.anonymizer = AnonymizerEngine()
             
-            print("✓ Presidio engines initialized successfully")
+            device_info = "GPU" if self.use_gpu else "CPU"
+            print(f"✓ Presidio engines initialized successfully ({device_info} mode)")
         except Exception as e:
             # Fallback to default configuration if spaCy model not available
             print(f"Warning: Could not load spaCy model, using default configuration: {e}")
@@ -207,7 +276,7 @@ class DataCleaner:
         scan_all_cells: bool
     ) -> Dict[str, List[Dict]]:
         """
-        Detect PII at column and cell level
+        Detect PII at column and cell level (GPU-accelerated when available)
         
         Returns:
             Dictionary mapping column names to list of detected entities
@@ -226,7 +295,8 @@ class DataCleaner:
             text_columns = df.select_dtypes(include=['object']).columns.tolist()
             columns_to_scan = list(set(columns_to_scan + text_columns))
         
-        print(f"  Scanning {len(columns_to_scan)} columns: {columns_to_scan}")
+        device_info = f"GPU ({GPU_NAME})" if self.use_gpu else "CPU"
+        print(f"  Scanning {len(columns_to_scan)} columns using {device_info}: {columns_to_scan}")
         
         for column in columns_to_scan:
             print(f"  Analyzing '{column}'...", end=" ")
@@ -526,7 +596,13 @@ class DataCleaner:
                 'original_columns': len(self.df.columns),
                 'cleaned_rows': len(cleaned_df),
                 'cleaned_columns': len(cleaned_df.columns),
-                'presidio_version': 'enabled' if PRESIDIO_AVAILABLE else 'disabled'
+                'presidio_version': 'enabled' if PRESIDIO_AVAILABLE else 'disabled',
+                'gpu_acceleration': {
+                    'enabled': self.use_gpu,
+                    'cuda_available': CUDA_AVAILABLE,
+                    'device': GPU_NAME if self.use_gpu else 'CPU',
+                    'gpu_memory_gb': GPU_MEMORY if self.use_gpu else 0
+                }
             },
             'summary': {
                 'columns_removed': [],
@@ -706,19 +782,22 @@ def main():
     import sys
     
     if len(sys.argv) < 2:
-        print("Usage: python cleaning.py <data_file.csv>")
+        print("Usage: python cleaning.py <data_file.csv> [--no-gpu]")
         print("Example: python cleaning.py Datasets/loan_data.csv")
+        print("Options:")
+        print("  --no-gpu    Disable GPU acceleration (use CPU only)")
         sys.exit(1)
     
     data_path = sys.argv[1]
+    use_gpu = '--no-gpu' not in sys.argv
     
     # Load data
     print(f"Loading data from {data_path}...")
     df = pd.read_csv(data_path)
     print(f"Loaded {len(df)} rows × {len(df.columns)} columns")
     
-    # Initialize cleaner
-    cleaner = DataCleaner(df)
+    # Initialize cleaner with GPU support
+    cleaner = DataCleaner(df, use_gpu=use_gpu)
     
     # Run cleaning (interactive mode)
     cleaned_df, audit_report = cleaner.clean(