From 7e3938d555e32e7527c28869a458dc88febd6b9b Mon Sep 17 00:00:00 2001
From: nearlynithin <nearlynithin@gmail.com>
Date: Fri, 7 Nov 2025 04:32:02 +0530
Subject: [PATCH] feat: regex data sanitization

---
 discovery/main.py | 44 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 44 insertions(+)
 create mode 100644 discovery/main.py

diff --git a/discovery/main.py b/discovery/main.py
new file mode 100644
index 0000000..03715b2
--- /dev/null
+++ b/discovery/main.py
@@ -0,0 +1,44 @@
+import csv
+import re
+from pathlib import Path
+
+ROOT = Path("../../archiv/Data/Politics")  # corpus root; relative, so resolved against the working directory
+# Pre-compiled detectors. They are heuristic: a hit is a candidate entity, not a validated one.
+email_re = re.compile(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)+")  # each "." must be followed by a label, so a sentence-final dot is not captured
+phone_re = re.compile(r"(?<!\w)(?:\+?\d{1,3}[-.\s]?)?(?:\(?\d{2,4}\)?[-.\s]?)?\d{3,4}[-.\s]?\d{3,4}(?!\w)")  # lookarounds stop matches inside longer digit/word runs
+ssn_re = re.compile(r"\b\d{3}[-\s]?\d{2}[-\s]?\d{4}\b")
+uuid_re = re.compile(r"\b[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}\b")
+pan_re = re.compile(r"\b[A-Z]{5}\d{4}[A-Z]\b")
+iban_re = re.compile(r"\b[A-Z]{2}\d{2}[A-Z0-9]{11,30}\b")  # country + check digits + 11-30 char BBAN (IBANs are 15-34 chars overall)
+# Label -> detector. Insertion order is the order in which labels appear in the output.
+patterns = {
+    "EMAIL": email_re,
+    "PHONE": phone_re,
+    "SSN": ssn_re,
+    "UUID": uuid_re,
+    "PAN": pan_re,
+    "IBAN": iban_re,
+}
+
+def find_entities(text):
+    """Return one "LABEL: match" string per regex hit in text, ordered by label in ``patterns``, then by position."""
+    hits = []
+    for label, regex in patterns.items():
+        hits.extend(f"{label}: {hit.group(0)}" for hit in regex.finditer(text))
+    return hits
+
+# Build the dataset: one CSV row (path, text excerpt, detected entities) per non-empty file under ROOT.
+with open("discovery_dataset.csv", "w", newline="", encoding="utf-8") as csvfile:
+    writer = csv.writer(csvfile)
+    writer.writerow(["filename", "text", "detected_entities"])
+
+    for path in sorted(ROOT.rglob("*")):  # sorted so reruns emit rows in the same order
+        if not path.is_file():
+            continue
+        try:
+            text = path.read_text(encoding="utf-8", errors="ignore")
+        except OSError:  # unreadable file (permissions, vanished mid-scan, ...): skip it, keep scanning
+            continue
+        if not text.strip():  # skip empty files before spending time on the regexes
+            continue
+        writer.writerow([str(path), text[:5000], "; ".join(find_entities(text))])  # entities come from the full text; stored text is capped