From 927b91951820ce3793baad174f0898ca01705191 Mon Sep 17 00:00:00 2001 From: dionjoshualobo <23h13.joshua@sjec.ac.in> Date: Fri, 7 Nov 2025 08:35:37 +0530 Subject: [PATCH 1/3] Updated gitignore and requirements --- .gitignore | 1 + requirements.txt | 8 +++++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index d6f6a5a..e13f817 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,7 @@ __pycache__/ *.so .Python build/ +output develop-eggs/ dist/ downloads/ diff --git a/requirements.txt b/requirements.txt index d7778da..e69a34b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,4 +3,10 @@ pandas>=2.0.0 numpy>=1.24.0 scikit-learn>=1.3.0 python-dateutil>=2.8.0 -setuptools>=65.0.0 \ No newline at end of file +setuptools>=65.0.0 + +# PII Detection and Anonymization (for cleaning.py) +presidio-analyzer>=2.2.0 +presidio-anonymizer>=2.2.0 +spacy>=3.7.0 +# Download spaCy model with: python -m spacy download en_core_web_sm \ No newline at end of file From f1af97f7c9bafacb353a6986e8d3a68d1c9d37e9 Mon Sep 17 00:00:00 2001 From: Chaithz Date: Fri, 7 Nov 2025 09:30:45 +0530 Subject: [PATCH 2/3] feat: csv generator --- discovery/main.py | 169 +++++++++++++++++++++++++++++++++++++++------- 1 file changed, 146 insertions(+), 23 deletions(-) diff --git a/discovery/main.py b/discovery/main.py index 03715b2..96fa8e7 100644 --- a/discovery/main.py +++ b/discovery/main.py @@ -1,44 +1,167 @@ import csv import re from pathlib import Path +from collections import Counter +from datetime import datetime -ROOT = Path("../../archiv/Data/Politics") +ROOT = Path("../Data/Politics") -email_re = re.compile(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+") -phone_re = re.compile(r"(?:\+?\d{1,3}[-.\s]?)?(?:\(?\d{2,4}\)?[-.\s]?)?\d{3,4}[-.\s]?\d{3,4}") -ssn_re = re.compile(r"\b\d{3}[-\s]?\d{2}[-\s]?\d{4}\b") -uuid_re = re.compile(r"\b[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}\b") -pan_re = re.compile(r"\b[A-Z]{5}\d{4}[A-Z]\b") -iban_re = re.compile(r"\b[A-Z]{2}\d{2}[A-Z0-9]{1,30}\b") +# Try to import spaCy, fall back to basic extraction if not available +try: + import spacy + nlp = spacy.load("en_core_web_sm") + USE_SPACY = True +except: + USE_SPACY = False +# Regex patterns for deterministic detection patterns = { - "EMAIL": email_re, - "PHONE": phone_re, - "SSN": ssn_re, - "UUID": uuid_re, - "PAN": pan_re, - "IBAN": iban_re, + "EMAIL": re.compile(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+"), + "PHONE": re.compile(r"(?:\+?\d{1,3}[-.\s]?)?(?:\(?\d{2,4}\)?[-.\s]?)?\d{3,4}[-.\s]?\d{3,4}"), + "UUID": re.compile(r"\b[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}\b"), + "IBAN": re.compile(r"\b[A-Z]{2}\d{2}[A-Z0-9]{1,30}\b"), + "DATE": re.compile(r"\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b|\b\d{4}[/-]\d{1,2}[/-]\d{1,2}\b|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* \d{1,2},? \d{4}"), + "URL": re.compile(r"https?://[^\s]+"), + "SSN": re.compile(r"\b\d{3}-\d{2}-\d{4}\b"), } def find_entities(text): - found = [] + """Extract entities using regex patterns.""" + found = {} for label, pattern in patterns.items(): - for m in pattern.finditer(text): - found.append(f"{label}: {m.group(0)}") + matches = pattern.findall(text) + if matches: + found[label] = list(set(matches))[:5] # Limit to 5 per type return found -with open("discovery_dataset.csv", "w", newline="", encoding="utf-8") as csvfile: - writer = csv.writer(csvfile) - writer.writerow(["filename", "text", "detected_entities"]) +def extract_with_spacy(text): + """Extract named entities using spaCy.""" + if not USE_SPACY: + return {}, {}, {} + + doc = nlp(text[:10000]) # Limit text length for performance + + persons = [] + orgs = [] + locations = [] + + for ent in doc.ents: + if ent.label_ == "PERSON": + persons.append(ent.text) + elif ent.label_ == "ORG": + orgs.append(ent.text) + elif ent.label_ in ["GPE", "LOC"]: + locations.append(ent.text) + + # Return most common entities + return ( + dict(Counter(persons).most_common(5)), + dict(Counter(orgs).most_common(5)), + dict(Counter(locations).most_common(5)) + ) +def extract_metadata(text, filename): + """Extract basic metadata from text.""" + metadata = { + "char_count": len(text), + "word_count": len(text.split()), + "line_count": text.count('\n') + 1, + "file_extension": Path(filename).suffix, + } + return metadata + +def detect_content_type(text): + """Heuristic content type detection.""" + text_lower = text.lower() + + # Check for common document types + if any(word in text_lower[:1000] for word in ['dear', 'sincerely', 'regards']): + return "letter" + elif any(word in text_lower[:500] for word in ['article', 'section', 'amendment']): + return "legal" + elif any(word in text_lower[:500] for word in ['press release', 'for immediate release']): + return "press_release" + elif re.search(r'^\s*#', text[:100], re.MULTILINE): + return "markdown" + elif ' 0: + print("\nTo install spaCy for better entity extraction:") + print(" pip install spacy") + print(" python -m spacy download en_core_web_sm") \ No newline at end of file From 8ead47d54daa05e167e7db58abc5412bb3fcd920 Mon Sep 17 00:00:00 2001 From: Chaithz Date: Fri, 7 Nov 2025 09:32:16 +0530 Subject: [PATCH 3/3] ignore data dir --- .gitignore | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index d6f6a5a..a637d67 100644 --- a/.gitignore +++ b/.gitignore @@ -56,4 +56,6 @@ reports/*.pdf # Data (keep demo dataset) *.csv -!Datasets/loan_data.csv \ No newline at end of file +!Datasets/loan_data.csv + +Data \ No newline at end of file