From 7e3938d555e32e7527c28869a458dc88febd6b9b Mon Sep 17 00:00:00 2001 From: nearlynithin Date: Fri, 7 Nov 2025 04:32:02 +0530 Subject: [PATCH] feat: regex data sanitization --- discovery/main.py | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 discovery/main.py diff --git a/discovery/main.py b/discovery/main.py new file mode 100644 index 0000000..03715b2 --- /dev/null +++ b/discovery/main.py @@ -0,0 +1,44 @@ +import csv +import re +from pathlib import Path + +ROOT = Path("../../archiv/Data/Politics") + +email_re = re.compile(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+") +phone_re = re.compile(r"(?:\+?\d{1,3}[-.\s]?)?(?:\(?\d{2,4}\)?[-.\s]?)?\d{3,4}[-.\s]?\d{3,4}") +ssn_re = re.compile(r"\b\d{3}[-\s]?\d{2}[-\s]?\d{4}\b") +uuid_re = re.compile(r"\b[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}\b") +pan_re = re.compile(r"\b[A-Z]{5}\d{4}[A-Z]\b") +iban_re = re.compile(r"\b[A-Z]{2}\d{2}[A-Z0-9]{1,30}\b") + +patterns = { + "EMAIL": email_re, + "PHONE": phone_re, + "SSN": ssn_re, + "UUID": uuid_re, + "PAN": pan_re, + "IBAN": iban_re, +} + +def find_entities(text): + found = [] + for label, pattern in patterns.items(): + for m in pattern.finditer(text): + found.append(f"{label}: {m.group(0)}") + return found + +with open("discovery_dataset.csv", "w", newline="", encoding="utf-8") as csvfile: + writer = csv.writer(csvfile) + writer.writerow(["filename", "text", "detected_entities"]) + + for path in ROOT.rglob("*"): + if not path.is_file(): + continue + try: + text = path.read_text(encoding="utf-8", errors="ignore") + except Exception: + continue + + entities = find_entities(text) + if text.strip(): # skip empty + writer.writerow([str(path), text[:5000], "; ".join(entities)]) # limit length if huge