From 927b91951820ce3793baad174f0898ca01705191 Mon Sep 17 00:00:00 2001 From: dionjoshualobo <23h13.joshua@sjec.ac.in> Date: Fri, 7 Nov 2025 08:35:37 +0530 Subject: [PATCH 1/2] Updated gitignore and requirements --- .gitignore | 1 + requirements.txt | 8 +++++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index d6f6a5a..e13f817 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,7 @@ __pycache__/ *.so .Python build/ +output develop-eggs/ dist/ downloads/ diff --git a/requirements.txt b/requirements.txt index d7778da..e69a34b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,4 +3,10 @@ pandas>=2.0.0 numpy>=1.24.0 scikit-learn>=1.3.0 python-dateutil>=2.8.0 -setuptools>=65.0.0 \ No newline at end of file +setuptools>=65.0.0 + +# PII Detection and Anonymization (for cleaning.py) +presidio-analyzer>=2.2.0 +presidio-anonymizer>=2.2.0 +spacy>=3.7.0 +# Download spaCy model with: python -m spacy download en_core_web_sm \ No newline at end of file From 747e2df066c019d814395661c80e52b795bc0259 Mon Sep 17 00:00:00 2001 From: PlatypusPus <23h46.shovin@sjec.ac.in> Date: Fri, 7 Nov 2025 09:28:18 +0530 Subject: [PATCH 2/2] feat:Added CUDA --- cleaning.py | 97 ++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 88 insertions(+), 9 deletions(-) diff --git a/cleaning.py b/cleaning.py index f97eac6..a83d0da 100644 --- a/cleaning.py +++ b/cleaning.py @@ -1,6 +1,7 @@ """ Data Cleaning Module - PII Detection and Anonymization Handles GDPR-compliant data cleaning using Presidio for PII detection +GPU-accelerated for faster processing of large datasets """ import pandas as pd @@ -20,6 +21,36 @@ try: except ImportError: PRESIDIO_AVAILABLE = False print("Warning: Presidio not installed. 
Run: pip install presidio-analyzer presidio-anonymizer") + +# GPU detection +try: + import torch + CUDA_AVAILABLE = torch.cuda.is_available() + if CUDA_AVAILABLE: + GPU_DEVICE = 0 # Use first GPU + GPU_NAME = torch.cuda.get_device_name(0) + GPU_MEMORY = torch.cuda.get_device_properties(0).total_memory / 1024**3 # GB + else: + GPU_DEVICE = -1 + GPU_NAME = None + GPU_MEMORY = 0 +except ImportError: + CUDA_AVAILABLE = False + GPU_DEVICE = -1 + GPU_NAME = None + GPU_MEMORY = 0 + +try: + import spacy + SPACY_AVAILABLE = True + # Check if spaCy can use GPU + if CUDA_AVAILABLE: + spacy.require_gpu() +except ImportError: + SPACY_AVAILABLE = False +except Exception: + # GPU not available for spaCy, will fall back to CPU + pass class CleaningConfig: @@ -97,18 +128,23 @@ class DataCleaner: ... ) """ - def __init__(self, df: pd.DataFrame, config: Optional[CleaningConfig] = None): + def __init__(self, df: pd.DataFrame, config: Optional[CleaningConfig] = None, use_gpu: bool = True): """ Initialize the data cleaner Args: df: Input DataFrame to clean config: Optional custom configuration + use_gpu: Whether to use GPU acceleration if available (default: True) """ self.df = df.copy() self.config = config or CleaningConfig() self.audit_log = [] self.cleaning_actions = {} + self.use_gpu = use_gpu and CUDA_AVAILABLE + + # Display GPU info + self._display_gpu_info() # Initialize Presidio engines if PRESIDIO_AVAILABLE: @@ -119,8 +155,29 @@ class DataCleaner: "Install with: pip install presidio-analyzer presidio-anonymizer" ) + def _display_gpu_info(self): + """Display GPU availability and configuration""" + print("\n" + "="*70) + print("🖥️ HARDWARE CONFIGURATION") + print("="*70) + + if CUDA_AVAILABLE and self.use_gpu: + print(f"✓ GPU ACCELERATION: ENABLED") + print(f" Device: {GPU_NAME}") + print(f" Memory: {GPU_MEMORY:.2f} GB") + print(f" CUDA Device ID: {GPU_DEVICE}") + elif CUDA_AVAILABLE and not self.use_gpu: + print(f"⚠️ GPU ACCELERATION: DISABLED (use_gpu=False)") + print(f" 
Available GPU: {GPU_NAME} ({GPU_MEMORY:.2f} GB)") + else: + print(f"⚠️ GPU ACCELERATION: NOT AVAILABLE") + print(f" Reason: {'PyTorch not installed' if not 'torch' in dir() else 'No CUDA device detected'}") + print(f" Install: pip install torch --index-url https://download.pytorch.org/whl/cu121") + + print("="*70 + "\n") + def _init_presidio(self): - """Initialize Presidio analyzer and anonymizer engines""" + """Initialize Presidio analyzer and anonymizer engines with GPU support""" # Create NLP engine configuration configuration = { "nlp_engine_name": "spacy", @@ -132,11 +189,23 @@ class DataCleaner: provider = NlpEngineProvider(nlp_configuration=configuration) nlp_engine = provider.create_engine() + # Enable GPU for spaCy if available + if self.use_gpu and SPACY_AVAILABLE: + try: + import spacy + # Move spaCy model to GPU + spacy.require_gpu() + print("✓ spaCy GPU acceleration enabled") + except Exception as e: + print(f"⚠️ Could not enable spaCy GPU: {e}") + print(" Falling back to CPU for NLP processing") + # Create analyzer with NLP engine self.analyzer = AnalyzerEngine(nlp_engine=nlp_engine) self.anonymizer = AnonymizerEngine() - print("✓ Presidio engines initialized successfully") + device_info = "GPU" if self.use_gpu else "CPU" + print(f"✓ Presidio engines initialized successfully ({device_info} mode)") except Exception as e: # Fallback to default configuration if spaCy model not available print(f"Warning: Could not load spaCy model, using default configuration: {e}") @@ -207,7 +276,7 @@ class DataCleaner: scan_all_cells: bool ) -> Dict[str, List[Dict]]: """ - Detect PII at column and cell level + Detect PII at column and cell level (GPU-accelerated when available) Returns: Dictionary mapping column names to list of detected entities @@ -226,7 +295,8 @@ class DataCleaner: text_columns = df.select_dtypes(include=['object']).columns.tolist() columns_to_scan = list(set(columns_to_scan + text_columns)) - print(f" Scanning {len(columns_to_scan)} columns: 
{columns_to_scan}") + device_info = f"GPU ({GPU_NAME})" if self.use_gpu else "CPU" + print(f" Scanning {len(columns_to_scan)} columns using {device_info}: {columns_to_scan}") for column in columns_to_scan: print(f" Analyzing '{column}'...", end=" ") @@ -526,7 +596,13 @@ class DataCleaner: 'original_columns': len(self.df.columns), 'cleaned_rows': len(cleaned_df), 'cleaned_columns': len(cleaned_df.columns), - 'presidio_version': 'enabled' if PRESIDIO_AVAILABLE else 'disabled' + 'presidio_version': 'enabled' if PRESIDIO_AVAILABLE else 'disabled', + 'gpu_acceleration': { + 'enabled': self.use_gpu, + 'cuda_available': CUDA_AVAILABLE, + 'device': GPU_NAME if self.use_gpu else 'CPU', + 'gpu_memory_gb': GPU_MEMORY if self.use_gpu else 0 + } }, 'summary': { 'columns_removed': [], @@ -706,19 +782,22 @@ def main(): import sys if len(sys.argv) < 2: - print("Usage: python cleaning.py ") + print("Usage: python cleaning.py [--no-gpu]") print("Example: python cleaning.py Datasets/loan_data.csv") + print("Options:") + print(" --no-gpu Disable GPU acceleration (use CPU only)") sys.exit(1) data_path = sys.argv[1] + use_gpu = '--no-gpu' not in sys.argv # Load data print(f"Loading data from {data_path}...") df = pd.read_csv(data_path) print(f"Loaded {len(df)} rows × {len(df.columns)} columns") - # Initialize cleaner - cleaner = DataCleaner(df) + # Initialize cleaner with GPU support + cleaner = DataCleaner(df, use_gpu=use_gpu) # Run cleaning (interactive mode) cleaned_df, audit_report = cleaner.clean(