Mirror of https://github.com/PlatypusPus/MushroomEmpire.git (synced 2026-02-07 22:18:59 +00:00)
Merge branch 'main' into main
.gitignore (vendored): 4 lines changed
@@ -57,4 +57,6 @@ reports/*.pdf
 # Data (keep demo dataset)
 *.csv
 !Datasets/loan_data.csv
 
+Data
+
Datasets/loan_data_cleaned_audit.json (new file, 142 lines)
@@ -0,0 +1,142 @@
{
  "metadata": {
    "timestamp": "2025-11-07T08:59:26.645555",
    "original_rows": 45000,
    "original_columns": 14,
    "cleaned_rows": 45000,
    "cleaned_columns": 13,
    "presidio_version": "enabled",
    "gpu_acceleration": {
      "enabled": true,
      "cuda_available": true,
      "device": "NVIDIA GeForce RTX 4050 Laptop GPU",
      "gpu_memory_gb": 5.99658203125
    }
  },
  "summary": {
    "columns_removed": [
      "person_education"
    ],
    "columns_anonymized": [
      "loan_intent",
      "person_home_ownership"
    ],
    "total_cells_affected": 49906
  },
  "details": {
    "loan_intent": {
      "action": "ANONYMIZED",
      "strategies_applied": [
        "HASH"
      ],
      "reason": "Contains ORGANIZATION entities. Applied hash anonymization to protect privacy.",
      "entity_types_found": [
        "ORGANIZATION"
      ],
      "num_affected_rows": 23512,
      "percentage_affected": "52.2%",
      "examples": [
        {
          "before": "MEDICAL",
          "after": "a978e21c3754862e57020380a3e9ea7ed66e16dfa3db6fb28b"
        },
        {
          "before": "MEDICAL",
          "after": "a978e21c3754862e57020380a3e9ea7ed66e16dfa3db6fb28b"
        },
        {
          "before": "MEDICAL",
          "after": "a978e21c3754862e57020380a3e9ea7ed66e16dfa3db6fb28b"
        }
      ],
      "presidio_metrics": {
        "avg_confidence": 0.85,
        "detections": [
          {
            "entity_type": "ORGANIZATION",
            "count": 49,
            "avg_confidence": 0.85,
            "max_confidence": 0.85,
            "min_confidence": 0.85
          }
        ]
      },
      "gdpr_compliance": []
    },
    "person_home_ownership": {
      "action": "ANONYMIZED",
      "strategies_applied": [
        "MASK"
      ],
      "reason": "Contains ORGANIZATION, LOCATION entities. Applied mask anonymization to protect privacy.",
      "entity_types_found": [
        "ORGANIZATION",
        "LOCATION"
      ],
      "num_affected_rows": 26394,
      "percentage_affected": "58.7%",
      "examples": [
        {
          "before": "RENT",
          "after": "****"
        },
        {
          "before": "OWN",
          "after": "***"
        },
        {
          "before": "RENT",
          "after": "****"
        }
      ],
      "presidio_metrics": {
        "avg_confidence": 0.85,
        "detections": [
          {
            "entity_type": "ORGANIZATION",
            "count": 24,
            "avg_confidence": 0.85,
            "max_confidence": 0.85,
            "min_confidence": 0.85
          },
          {
            "entity_type": "LOCATION",
            "count": 49,
            "avg_confidence": 0.85,
            "max_confidence": 0.85,
            "min_confidence": 0.85
          }
        ]
      },
      "gdpr_compliance": [
        "Art. 4(1) - Personal data (location)"
      ]
    },
    "person_education": {
      "action": "REMOVED",
      "reason": "Contains HIGH risk PII requiring removal",
      "entity_types_found": [
        "ORGANIZATION"
      ],
      "risk_level": "HIGH",
      "presidio_metrics": {
        "detections": [
          {
            "entity_type": "ORGANIZATION",
            "count": 4,
            "avg_confidence": 0.85,
            "max_confidence": 0.85,
            "min_confidence": 0.85
          }
        ]
      },
      "gdpr_compliance": []
    }
  },
  "compliance": {
    "gdpr_articles_applied": [
      "Art. 4(1) - Personal data (location)"
    ],
    "risk_mitigation": {}
  }
}
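The audit file above is plain JSON, so downstream tooling can consume it directly. A minimal sketch of reading it back (not part of this commit; the path and key names are taken from the file as shown):

import json

with open("Datasets/loan_data_cleaned_audit.json") as f:
    audit = json.load(f)

meta = audit["metadata"]
print(f"Rows: {meta['original_rows']} -> {meta['cleaned_rows']}, "
      f"columns: {meta['original_columns']} -> {meta['cleaned_columns']}")

# Per-column actions, e.g. "loan_intent: ANONYMIZED (ORGANIZATION)"
for column, detail in audit["details"].items():
    entities = ", ".join(detail["entity_types_found"])
    print(f"  {column}: {detail['action']} ({entities})")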
cleaning.py (105 lines changed)
@@ -1,6 +1,7 @@
 """
 Data Cleaning Module - PII Detection and Anonymization
 Handles GDPR-compliant data cleaning using Presidio for PII detection
+GPU-accelerated for faster processing of large datasets
 """
 
 import pandas as pd
@@ -20,6 +21,35 @@ try:
 except ImportError:
     PRESIDIO_AVAILABLE = False
     print("Warning: Presidio not installed. Run: pip install presidio-analyzer presidio-anonymizer")
+# GPU detection
+try:
+    import torch
+    CUDA_AVAILABLE = torch.cuda.is_available()
+    if CUDA_AVAILABLE:
+        GPU_DEVICE = 0  # Use first GPU
+        GPU_NAME = torch.cuda.get_device_name(0)
+        GPU_MEMORY = torch.cuda.get_device_properties(0).total_memory / 1024**3  # GB
+    else:
+        GPU_DEVICE = -1
+        GPU_NAME = None
+        GPU_MEMORY = 0
+except ImportError:
+    CUDA_AVAILABLE = False
+    GPU_DEVICE = -1
+    GPU_NAME = None
+    GPU_MEMORY = 0
+
+try:
+    import spacy
+    SPACY_AVAILABLE = True
+    # Check if spaCy can use GPU
+    if CUDA_AVAILABLE:
+        spacy.require_gpu()
+except ImportError:
+    SPACY_AVAILABLE = False
+except Exception:
+    # GPU not available for spaCy, will fall back to CPU
+    pass
 
 
 def convert_to_json_serializable(obj):
@@ -112,18 +142,23 @@ class DataCleaner:
         ... )
         """
 
-    def __init__(self, df: pd.DataFrame, config: Optional[CleaningConfig] = None):
+    def __init__(self, df: pd.DataFrame, config: Optional[CleaningConfig] = None, use_gpu: bool = True):
         """
         Initialize the data cleaner
 
         Args:
             df: Input DataFrame to clean
             config: Optional custom configuration
+            use_gpu: Whether to use GPU acceleration if available (default: True)
         """
         self.df = df.copy()
         self.config = config or CleaningConfig()
         self.audit_log = []
         self.cleaning_actions = {}
+        self.use_gpu = use_gpu and CUDA_AVAILABLE
+
+        # Display GPU info
+        self._display_gpu_info()
 
         # Initialize Presidio engines
         if PRESIDIO_AVAILABLE:
@@ -134,8 +169,29 @@ class DataCleaner:
                 "Install with: pip install presidio-analyzer presidio-anonymizer"
             )
 
+    def _display_gpu_info(self):
+        """Display GPU availability and configuration"""
+        print("\n" + "="*70)
+        print("🖥️ HARDWARE CONFIGURATION")
+        print("="*70)
+
+        if CUDA_AVAILABLE and self.use_gpu:
+            print("✓ GPU ACCELERATION: ENABLED")
+            print(f"   Device: {GPU_NAME}")
+            print(f"   Memory: {GPU_MEMORY:.2f} GB")
+            print(f"   CUDA Device ID: {GPU_DEVICE}")
+        elif CUDA_AVAILABLE and not self.use_gpu:
+            print("⚠️ GPU ACCELERATION: DISABLED (use_gpu=False)")
+            print(f"   Available GPU: {GPU_NAME} ({GPU_MEMORY:.2f} GB)")
+        else:
+            print("⚠️ GPU ACCELERATION: NOT AVAILABLE")
+            # 'torch' is in module globals only if the import above succeeded
+            print(f"   Reason: {'PyTorch not installed' if 'torch' not in globals() else 'No CUDA device detected'}")
+            print("   Install: pip install torch --index-url https://download.pytorch.org/whl/cu121")
+
+        print("="*70 + "\n")
+
     def _init_presidio(self):
-        """Initialize Presidio analyzer and anonymizer engines with Nordic recognizers"""
+        """Initialize Presidio analyzer and anonymizer engines with GPU support"""
         # Create NLP engine configuration
         configuration = {
             "nlp_engine_name": "spacy",
@@ -147,18 +203,23 @@ class DataCleaner:
             provider = NlpEngineProvider(nlp_configuration=configuration)
             nlp_engine = provider.create_engine()
 
-            # Create registry and add Nordic recognizers
-            registry = RecognizerRegistry()
-            registry.load_predefined_recognizers(nlp_engine=nlp_engine)
-
-            # Add Nordic-specific recognizers
-            self._add_nordic_recognizers(registry)
-
-            # Create analyzer with custom registry
-            self.analyzer = AnalyzerEngine(registry=registry, nlp_engine=nlp_engine)
+            # Enable GPU for spaCy if available
+            if self.use_gpu and SPACY_AVAILABLE:
+                try:
+                    import spacy
+                    # Move spaCy model to GPU
+                    spacy.require_gpu()
+                    print("✓ spaCy GPU acceleration enabled")
+                except Exception as e:
+                    print(f"⚠️ Could not enable spaCy GPU: {e}")
+                    print("   Falling back to CPU for NLP processing")
+
+            # Create analyzer with NLP engine
+            self.analyzer = AnalyzerEngine(nlp_engine=nlp_engine)
+
             self.anonymizer = AnonymizerEngine()
 
-            print("✓ Presidio engines initialized with Nordic PII recognizers")
+            device_info = "GPU" if self.use_gpu else "CPU"
+            print(f"✓ Presidio engines initialized successfully ({device_info} mode)")
         except Exception as e:
             # Fallback to default configuration if spaCy model not available
             print(f"Warning: Could not load spaCy model, using default configuration: {e}")
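As an aside, the engine setup in this hunk follows presidio-analyzer's standard pattern. A minimal self-contained sketch of the same flow, independent of this class (the en_core_web_lg model name is an assumption; the models entry of the configuration dict is cut off by the hunk above):

from presidio_analyzer import AnalyzerEngine
from presidio_analyzer.nlp_engine import NlpEngineProvider

configuration = {
    "nlp_engine_name": "spacy",
    "models": [{"lang_code": "en", "model_name": "en_core_web_lg"}],
}
nlp_engine = NlpEngineProvider(nlp_configuration=configuration).create_engine()
analyzer = AnalyzerEngine(nlp_engine=nlp_engine)

# Analyze a sample string for PII entities
results = analyzer.analyze(text="Contact John Smith in Oslo", language="en")
for r in results:
    print(r.entity_type, r.score)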
@@ -313,7 +374,7 @@ class DataCleaner:
         scan_all_cells: bool
     ) -> Dict[str, List[Dict]]:
         """
-        Detect PII at column and cell level
+        Detect PII at column and cell level (GPU-accelerated when available)
 
         Returns:
             Dictionary mapping column names to list of detected entities
@@ -332,7 +393,8 @@ class DataCleaner:
         text_columns = df.select_dtypes(include=['object']).columns.tolist()
         columns_to_scan = list(set(columns_to_scan + text_columns))
 
-        print(f"  Scanning {len(columns_to_scan)} columns: {columns_to_scan}")
+        device_info = f"GPU ({GPU_NAME})" if self.use_gpu else "CPU"
+        print(f"  Scanning {len(columns_to_scan)} columns using {device_info}: {columns_to_scan}")
 
         for column in columns_to_scan:
             print(f"    Analyzing '{column}'...", end=" ")
@@ -632,7 +694,13 @@ class DataCleaner:
             'original_columns': len(self.df.columns),
             'cleaned_rows': len(cleaned_df),
             'cleaned_columns': len(cleaned_df.columns),
-            'presidio_version': 'enabled' if PRESIDIO_AVAILABLE else 'disabled'
+            'presidio_version': 'enabled' if PRESIDIO_AVAILABLE else 'disabled',
+            'gpu_acceleration': {
+                'enabled': self.use_gpu,
+                'cuda_available': CUDA_AVAILABLE,
+                'device': GPU_NAME if self.use_gpu else 'CPU',
+                'gpu_memory_gb': GPU_MEMORY if self.use_gpu else 0
+            }
         },
         'summary': {
             'total_rows': len(self.df),
@@ -1285,19 +1353,22 @@ def main():
     import sys
 
     if len(sys.argv) < 2:
-        print("Usage: python cleaning.py <data_file.csv>")
+        print("Usage: python cleaning.py <data_file.csv> [--no-gpu]")
         print("Example: python cleaning.py Datasets/loan_data.csv")
+        print("Options:")
+        print("  --no-gpu    Disable GPU acceleration (use CPU only)")
         sys.exit(1)
 
     data_path = sys.argv[1]
+    use_gpu = '--no-gpu' not in sys.argv
 
     # Load data
     print(f"Loading data from {data_path}...")
     df = pd.read_csv(data_path)
     print(f"Loaded {len(df)} rows × {len(df.columns)} columns")
 
-    # Initialize cleaner
-    cleaner = DataCleaner(df)
+    # Initialize cleaner with GPU support
+    cleaner = DataCleaner(df, use_gpu=use_gpu)
 
     # Run cleaning (interactive mode)
     cleaned_df, audit_report = cleaner.clean(
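Taken together, these changes make GPU use opt-out rather than hard-wired. A minimal usage sketch of the new constructor parameter (assumes cleaning.py is importable as a module; not part of the commit):

import pandas as pd
from cleaning import DataCleaner  # assumes cleaning.py is on the import path

df = pd.read_csv("Datasets/loan_data.csv")

# use_gpu defaults to True but is ANDed with CUDA_AVAILABLE,
# so the cleaner falls back to CPU automatically on machines without CUDA.
cleaner = DataCleaner(df, use_gpu=False)  # force CPU explicitly, like --no-gpu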
discovery/main.py (new file, 167 lines)
@@ -0,0 +1,167 @@
import csv
import re
from pathlib import Path
from collections import Counter
from datetime import datetime

ROOT = Path("../Data/Politics")

# Try to import spaCy, fall back to basic extraction if not available
try:
    import spacy
    nlp = spacy.load("en_core_web_sm")
    USE_SPACY = True
except Exception:
    USE_SPACY = False

# Regex patterns for deterministic detection
patterns = {
    "EMAIL": re.compile(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+"),
    "PHONE": re.compile(r"(?:\+?\d{1,3}[-.\s]?)?(?:\(?\d{2,4}\)?[-.\s]?)?\d{3,4}[-.\s]?\d{3,4}"),
    "UUID": re.compile(r"\b[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}\b"),
    "IBAN": re.compile(r"\b[A-Z]{2}\d{2}[A-Z0-9]{1,30}\b"),
    "DATE": re.compile(r"\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b|\b\d{4}[/-]\d{1,2}[/-]\d{1,2}\b|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* \d{1,2},? \d{4}"),
    "URL": re.compile(r"https?://[^\s]+"),
    "SSN": re.compile(r"\b\d{3}-\d{2}-\d{4}\b"),
}

def find_entities(text):
    """Extract entities using regex patterns."""
    found = {}
    for label, pattern in patterns.items():
        matches = pattern.findall(text)
        if matches:
            found[label] = list(set(matches))[:5]  # Limit to 5 per type
    return found

def extract_with_spacy(text):
    """Extract named entities using spaCy."""
    if not USE_SPACY:
        return {}, {}, {}

    doc = nlp(text[:10000])  # Limit text length for performance

    persons = []
    orgs = []
    locations = []

    for ent in doc.ents:
        if ent.label_ == "PERSON":
            persons.append(ent.text)
        elif ent.label_ == "ORG":
            orgs.append(ent.text)
        elif ent.label_ in ["GPE", "LOC"]:
            locations.append(ent.text)

    # Return most common entities
    return (
        dict(Counter(persons).most_common(5)),
        dict(Counter(orgs).most_common(5)),
        dict(Counter(locations).most_common(5))
    )

def extract_metadata(text, filename):
    """Extract basic metadata from text."""
    metadata = {
        "char_count": len(text),
        "word_count": len(text.split()),
        "line_count": text.count('\n') + 1,
        "file_extension": Path(filename).suffix,
    }
    return metadata

def detect_content_type(text):
    """Heuristic content type detection."""
    text_lower = text.lower()

    # Check for common document types
    if any(word in text_lower[:1000] for word in ['dear', 'sincerely', 'regards']):
        return "letter"
    elif any(word in text_lower[:500] for word in ['article', 'section', 'amendment']):
        return "legal"
    elif any(word in text_lower[:500] for word in ['press release', 'for immediate release']):
        return "press_release"
    elif re.search(r'^\s*#', text[:100], re.MULTILINE):
        return "markdown"
    elif '<html' in text_lower[:200]:
        return "html"
    else:
        return "unknown"

# Define fieldnames
fieldnames = [
    "filename", "file_extension", "char_count", "word_count", "line_count",
    "content_type", "text_preview",
    "EMAIL", "PHONE", "UUID", "IBAN", "DATE", "URL", "SSN",
    "persons", "organizations", "locations"
]

print("Processing files...")
with open("discovery_dataset.csv", "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    file_count = 0
    for path in ROOT.rglob("*"):
        if not path.is_file():
            continue

        # Skip binary files
        if path.suffix.lower() in ['.exe', '.dll', '.so', '.dylib', '.bin', '.jpg', '.png', '.gif', '.pdf']:
            continue

        try:
            text = path.read_text(encoding="utf-8", errors="ignore")
        except Exception as e:
            print(f"  Error reading {path.name}: {e}")
            continue

        if not text.strip():
            continue

        file_count += 1
        if file_count % 10 == 0:
            print(f"Processed {file_count} files...")

        # Initialize row
        row = {"filename": str(path.relative_to(ROOT.parent))}

        # Extract metadata
        metadata = extract_metadata(text, path.name)
        row.update(metadata)

        # Detect content type
        row["content_type"] = detect_content_type(text)
        row["text_preview"] = text[:500].replace('\n', ' ').replace('\r', ' ')

        # Extract entities with regex
        entities = find_entities(text)
        for key, values in entities.items():
            row[key] = "; ".join(values) if values else ""

        # Fill in missing pattern fields
        for pattern_key in ["EMAIL", "PHONE", "UUID", "IBAN", "DATE", "URL", "SSN"]:
            if pattern_key not in row:
                row[pattern_key] = ""

        # Extract named entities with spaCy
        if USE_SPACY:
            persons, orgs, locs = extract_with_spacy(text)
            row["persons"] = "; ".join([f"{k}({v})" for k, v in persons.items()])
            row["organizations"] = "; ".join([f"{k}({v})" for k, v in orgs.items()])
            row["locations"] = "; ".join([f"{k}({v})" for k, v in locs.items()])
        else:
            row["persons"] = ""
            row["organizations"] = ""
            row["locations"] = ""

        writer.writerow(row)

print(f"\nComplete! Processed {file_count} files.")
print("Output: discovery_dataset.csv")

# Suggest installing spaCy when it was not available for this run
if file_count > 0 and not USE_SPACY:
    print("\nTo install spaCy for better entity extraction:")
    print("  pip install spacy")
    print("  python -m spacy download en_core_web_sm")
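A quick self-contained smoke test of the regex-extraction idea (the two patterns are copied from the table in the script above; not part of the commit):

import re

patterns = {
    "EMAIL": re.compile(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+"),
    "URL": re.compile(r"https?://[^\s]+"),
}

def find_entities(text):
    """Return up to five unique matches per pattern, keyed by entity label."""
    found = {}
    for label, pattern in patterns.items():
        matches = pattern.findall(text)
        if matches:
            found[label] = list(set(matches))[:5]
    return found

print(find_entities("Mail jane.doe@example.org or see https://example.org/report"))
# -> {'EMAIL': ['jane.doe@example.org'], 'URL': ['https://example.org/report']}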