Mirror of https://github.com/PlatypusPus/MushroomEmpire.git (synced 2026-02-07 22:18:59 +00:00)
fix: better text extraction
@@ -1,64 +1,145 @@
import csv
import csv, tempfile
import re
from pathlib import Path
from collections import Counter
from datetime import datetime
from fastapi import FastAPI, File, UploadFile
from fastapi.responses import FileResponse
import uvicorn
import pprint

ROOT = Path("../Data/Politics")
app = FastAPI()

import spacy
from spacy.matcher import PhraseMatcher, Matcher
from collections import defaultdict
import re

# Try to import spaCy, fall back to basic extraction if not available
try:
    import spacy
    nlp = spacy.load("en_core_web_trf")
    USE_TRF = True
except Exception:
    nlp = spacy.load("en_core_web_sm")
    USE_SPACY = True
except:
    USE_SPACY = False
    USE_TRF = False

# Regex patterns for deterministic detection
patterns = {
    "EMAIL": re.compile(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+"),
    "PHONE": re.compile(r"(?:\+?\d{1,3}[-.\s]?)?(?:\(?\d{2,4}\)?[-.\s]?)?\d{3,4}[-.\s]?\d{3,4}"),
    "UUID": re.compile(r"\b[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}\b"),
    "IBAN": re.compile(r"\b[A-Z]{2}\d{2}[A-Z0-9]{1,30}\b"),
    "DATE": re.compile(r"\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b|\b\d{4}[/-]\d{1,2}[/-]\d{1,2}\b|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* \d{1,2},? \d{4}"),
    "URL": re.compile(r"https?://[^\s]+"),
    "SSN": re.compile(r"\b\d{3}-\d{2}-\d{4}\b"),
}

# small helper regexes for quick validation
email_re = re.compile(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+")
phone_re = re.compile(r"(?:\+?\d{1,3}[-.\s]?)?(?:\(?\d{2,4}\)?[-.\s]?)?\d{3,4}[-.\s]?\d{3,4}")
short_token_re = re.compile(r"^[A-Za-z]{1,2}$")

def find_entities(text):
    """Extract entities using regex patterns."""
    found = {}
    for label, pattern in patterns.items():
        matches = pattern.findall(text)
        if matches:
            found[label] = list(set(matches))[:5]  # Limit to 5 per type
    return found
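A quick usage sketch (not part of the commit) of the regex pass; the sample string is made up and the output shown is approximate:

found = find_entities("Contact jane.doe@example.com or call 555-123-4567 before 12/01/2024")
# -> {'EMAIL': ['jane.doe@example.com'], 'PHONE': ['555-123-4567'], 'DATE': ['12/01/2024']}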
# blacklist/whitelist samples (extend for your domain)
BLACKLIST = set(["The", "In", "On", "And", "If", "But"])
WHITELIST_TITLES = set(["Dr.", "Mr.", "Mrs.", "Ms.", "Prof."])

# optional high-precision phrase matcher for domain terms (invoices etc.)
phrase_matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
for phrase in ["invoice number", "due date", "invoice", "total amount", "amount due", "customer name"]:
    phrase_matcher.add("INVOICE_FIELD", [nlp.make_doc(phrase)])

# generic matcher to capture patterns like "Name: John Doe"
matcher = Matcher(nlp.vocab)
matcher.add("KV_PATTERN", [
    [{"IS_ALPHA": True, "OP": "+"}, {"IS_PUNCT": True, "OP": "?"}, {"IS_TITLE": True, "OP": "+"}],
])
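A hedged sketch (not in the commit) of how the two matchers are consumed, assuming one of the spaCy models loaded; both return (match_id, start, end) token offsets, and the sample sentence is illustrative only:

doc = nlp("Invoice Number: 4521, Customer Name: John Doe")
for match_id, start, end in phrase_matcher(doc):
    print(nlp.vocab.strings[match_id], "->", doc[start:end].text)   # e.g. INVOICE_FIELD -> Invoice Number
for match_id, start, end in matcher(doc):
    print("KV candidate:", doc[start:end].text)                     # title-cased spans such as "John Doe"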

def find_regex_entities(text):
    emails = "; ".join(email_re.findall(text))
    phones = "; ".join(phone_re.findall(text))
    return {"EMAIL": emails, "PHONE": phones}


# chunking to process long texts without losing context
def chunk_text(text, max_chars=3000, overlap=200):
    start = 0
    L = len(text)
    while start < L:
        end = start + max_chars
        if end >= L:
            yield text[start:L]
            break
        # try to end at newline or space within a small window
        cut = text.rfind("\n", start, end)
        if cut <= start:
            cut = text.rfind(" ", start, end)
        if cut <= start:
            cut = end
        yield text[start:cut]
        start = max(cut - overlap, start + 1)  # step back for overlap, but always advance
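A small illustrative run (not from the commit) of the chunker on synthetic text:

long_text = "word " * 2000                                   # about 10,000 characters
chunks = list(chunk_text(long_text, max_chars=3000, overlap=200))
print(len(chunks), [len(c) for c in chunks])                 # a few chunks of roughly 3,000 characters each
# consecutive chunks share up to `overlap` characters, so entities split at a boundary keep some context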

def is_likely_name(ent_text):
    # filter out very short tokens and blacklisted tokens
    if short_token_re.match(ent_text):
        return False
    if any(tok.lower() in ("invoice", "total", "amount", "date", "http", "www") for tok in ent_text.split()):
        return False
    if ent_text.split()[0] in BLACKLIST:
        return False
    return True
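A few illustrative calls (not from the commit) showing what the name filter keeps and drops:

is_likely_name("Jane Doe")        # True
is_likely_name("Invoice Total")   # False - contains a noise token
is_likely_name("On")              # False - one- or two-letter token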

def add_entity(agg, ent_text, ctx):
    rec = agg[ent_text]
    rec["count"] += 1
    if len(rec["examples"]) < 3:
        rec["examples"].append(ctx)

def extract_with_spacy(text):
    """Extract named entities using spaCy."""
    if not USE_SPACY:
        return {}, {}, {}
    persons = defaultdict(lambda: {"count": 0, "examples": []})
    orgs = defaultdict(lambda: {"count": 0, "examples": []})
    locs = defaultdict(lambda: {"count": 0, "examples": []})

    doc = nlp(text[:10000])  # Limit text length for performance

    persons = []
    orgs = []
    locations = []
    for chunk in chunk_text(text):
        doc = nlp(chunk)
        for _, start, end in phrase_matcher(doc):
            span = doc[start:end]
        for match_id, start, end in matcher(doc):
            span = doc[start:end]
            # not necessarily an entity, but may give context

        for ent in doc.ents:
            if ent.label_ == "PERSON":
                persons.append(ent.text)
            elif ent.label_ == "ORG":
                orgs.append(ent.text)
            elif ent.label_ in ["GPE", "LOC"]:
                locations.append(ent.text)
            text_ent = ent.text.strip()
            label = ent.label_

    # Return most common entities
    return (
        dict(Counter(persons).most_common(5)),
        dict(Counter(orgs).most_common(5)),
        dict(Counter(locations).most_common(5))
    )
            # basic filtering rules
            if len(text_ent) < 2:
                continue
            if text_ent in BLACKLIST:
                continue

            # context snippet for examples (trim)
            sent_ctx = ent.sent.text.strip()
            if len(sent_ctx) > 200:
                sent_ctx = sent_ctx[:200] + "..."

            # label mapping - adapt to what spaCy model returns
            if label in ("PERSON", "PER", "PERSONS"):
                if is_likely_name(text_ent) or any(t in WHITELIST_TITLES for t in text_ent.split()):
                    add_entity(persons, text_ent, sent_ctx)
            elif label in ("ORG", "ORGANIZATION", "COMPANY"):
                add_entity(orgs, text_ent, sent_ctx)
            elif label in ("GPE", "LOC", "LOCATION", "CITY", "COUNTRY"):
                add_entity(locs, text_ent, sent_ctx)
            else:
                pass

        if USE_TRF:
            for ent in doc.ents:
                try:
                    vec_norms = [t.vector_norm for t in ent]
                    avg = sum(vec_norms) / max(len(vec_norms), 1)
                    # if avg very small it's likely low-quality
                    if avg < 5.0:
                        # treat low-norm ent as lower confidence, optionally skip
                        continue
                except Exception:
                    pass

    def finalize(d):
        out = {}
        for k, v in d.items():
            out[k] = {"count": v["count"], "examples": v["examples"]}
        return out

    return finalize(persons), finalize(orgs), finalize(locs)
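A rough calling sketch (not part of the commit); on the new aggregation path each returned mapping goes from entity text to a count plus up to three example sentences, e.g.:

persons, orgs, locs = extract_with_spacy("Dr. Jane Doe of Acme Corp visited Oslo twice.")
# persons might look like {"Jane Doe": {"count": 1, "examples": ["Dr. Jane Doe of Acme Corp visited Oslo twice."]}}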

def extract_metadata(text, filename):
    """Extract basic metadata from text."""
@@ -96,72 +177,60 @@ fieldnames = [
    "persons", "organizations", "locations"
]

print("Processing files...")
with open("discovery_dataset.csv", "w", newline="", encoding="utf-8") as csvfile:

# this route will accept every txt/log file that is not csv
@app.post("/api/files")
async def postFile(file: UploadFile):
    if file.filename.endswith("csv"):
        return {"error" : "Cannot accept csv files"}

    try:
        contents = await file.read()
        text = contents.decode("utf-8", errors="ignore")
    except Exception as e:
        return {"error": f"Could not read file: {e}"}

    # Skip empty uploads
    if not text.strip():
        return {"error": "File is empty or unreadable"}

    with tempfile.NamedTemporaryFile(delete=False, suffix=".csv", mode="w", newline="", encoding="utf-8") as csvfile:
        print("Processing files...")
        temp_path = Path(csvfile.name)
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        file_count = 0
        for path in ROOT.rglob("*"):
            if not path.is_file():
                continue
        row = {"filename" : file.filename}

            # Skip binary files
            if path.suffix.lower() in ['.exe', '.dll', '.so', '.dylib', '.bin', '.jpg', '.png', '.gif', '.pdf']:
                continue

            try:
                text = path.read_text(encoding="utf-8", errors="ignore")
            except Exception as e:
                print(f" Error reading {path.name}: {e}")
                continue

            if not text.strip():
                continue

            file_count += 1
            if file_count % 10 == 0:
                print(f"Processed {file_count} files...")

            # Initialize row
            row = {"filename": str(path.relative_to(ROOT.parent))}

        # Extract metadata
        metadata = extract_metadata(text, path.name)
        metadata = extract_metadata(text, row["filename"])
        row.update(metadata)

        # Detect content type
        row["content_type"] = detect_content_type(text)
        row["text_preview"] = text[:500].replace('\n', ' ').replace('\r', ' ')

        # Extract entities with regex
        entities = find_entities(text)
        for key, values in entities.items():
            row[key] = "; ".join(values) if values else ""

        # Fill in missing pattern fields
        for pattern_key in ["EMAIL", "PHONE", "UUID", "IBAN", "DATE", "URL", "SSN"]:
            if pattern_key not in row:
                row[pattern_key] = ""

        # Extract named entities with spaCy
        if USE_SPACY:
            persons, orgs, locs = extract_with_spacy(text)
            row["persons"] = "; ".join([f"{k}({v})" for k, v in persons.items()])
            row["organizations"] = "; ".join([f"{k}({v})" for k, v in orgs.items()])
            row["locations"] = "; ".join([f"{k}({v})" for k, v in locs.items()])
        else:
            row["persons"] = ""
            row["organizations"] = ""
            row["locations"] = ""
            regex_entities = find_regex_entities(text)
            row.update(regex_entities)
        row["persons"] = "; ".join([f"{k} ({v['count']})" for k, v in persons.items()])
        row["organizations"] = "; ".join([f"{k} ({v['count']})" for k, v in orgs.items()])
        row["locations"] = "; ".join([f"{k} ({v['count']})" for k, v in locs.items()])


        writer.writerow(row)
        print(pprint.pprint(row))

    print(f"\nComplete! Processed {file_count} files.")
    print(f"Output: discovery_dataset.csv")
    return FileResponse(
        temp_path, media_type="text/csv", filename="dataset.csv"
    )

# Print summary statistics
if file_count > 0:
    print("\nTo install spaCy for better entity extraction:")
    print(" pip install spacy")
    print(" python -m spacy download en_core_web_sm")
if __name__ == '__main__':
    uvicorn.run(app)
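A minimal client-side sketch (not in the commit) for exercising the upload route; the module name main, host, and port are assumptions based on uvicorn defaults:

# start the API first, e.g.  python main.py  or  uvicorn main:app
import requests

with open("sample.log", "rb") as fh:
    resp = requests.post("http://127.0.0.1:8000/api/files", files={"file": fh})

with open("dataset.csv", "wb") as out:
    out.write(resp.content)   # the endpoint responds with the generated CSV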
@@ -1,7 +1,6 @@
"use client";
import { TryTab } from "./Sidebar";
import { useState, useRef, useCallback, useEffect } from "react";
import { saveLatestUpload, getLatestUpload, deleteLatestUpload } from "../../lib/idb";

interface CenterPanelProps {
  tab: TryTab;
frontend/nordic-privacy-ai/package-lock.json (generated, 9 changes)
@@ -1001,6 +1001,7 @@
"integrity": "sha512-NZyJarBfL7nWwIq+FDL6Zp/yHEhePMNnnJ0y3qfieCrmNvYct8uvtiV41UvlSe6apAfk0fY1FbWx+NwfmpvtTg==",
"dev": true,
"license": "MIT",
"peer": true,
"bin": {
"acorn": "bin/acorn"
},
@@ -1441,6 +1442,7 @@
}
],
"license": "MIT",
"peer": true,
"dependencies": {
"baseline-browser-mapping": "^2.8.19",
"caniuse-lite": "^1.0.30001751",
@@ -2103,6 +2105,7 @@
"deprecated": "This version is no longer supported. Please see https://eslint.org/version-support for other options.",
"dev": true,
"license": "MIT",
"peer": true,
"dependencies": {
"@eslint-community/eslint-utils": "^4.2.0",
"@eslint-community/regexpp": "^4.6.1",
@@ -2271,6 +2274,7 @@
"integrity": "sha512-whOE1HFo/qJDyX4SnXzP4N6zOWn79WhnCUY/iDR0mPfQZO8wcYE4JClzI2oZrhBnnMUCBCHZhO6VQyoBU95mZA==",
"dev": true,
"license": "MIT",
"peer": true,
"dependencies": {
"@rtsao/scc": "^1.1.0",
"array-includes": "^3.1.9",
@@ -4423,6 +4427,7 @@
}
],
"license": "MIT",
"peer": true,
"dependencies": {
"nanoid": "^3.3.7",
"picocolors": "^1.0.0",
@@ -4581,6 +4586,7 @@
"resolved": "https://registry.npmjs.org/react/-/react-18.2.0.tgz",
"integrity": "sha512-/3IjMdb2L9QbBdWiW5e3P2/npwMBaU9mHCSCUzNln0ZCYbcfTsGbTJrU/kGemdH2IWmB2ioZ+zkxtmq6g09fGQ==",
"license": "MIT",
"peer": true,
"dependencies": {
"loose-envify": "^1.1.0"
},
@@ -4593,6 +4599,7 @@
"resolved": "https://registry.npmjs.org/react-dom/-/react-dom-18.2.0.tgz",
"integrity": "sha512-6IMTriUmvsjHUjNtEDudZfuDQUoWXVxKHhlEGSk81n4YFS+r/Kl99wXiwlVXtPBtJenozv2P+hxDsw9eA7Xo6g==",
"license": "MIT",
"peer": true,
"dependencies": {
"loose-envify": "^1.1.0",
"scheduler": "^0.23.0"
@@ -5539,6 +5546,7 @@
"integrity": "sha512-5gTmgEY/sqK6gFXLIsQNH19lWb4ebPDLA4SdLP7dsWkIXHWlG66oPuVvXSGFPppYZz8ZDZq0dYYrbHfBCVUb1Q==",
"dev": true,
"license": "MIT",
"peer": true,
"engines": {
"node": ">=12"
},
@@ -5708,6 +5716,7 @@
"integrity": "sha512-vcI4UpRgg81oIRUFwR0WSIHKt11nJ7SAVlYNIu+QpqeyXP+gpQJy/Z4+F0aGxSE4MqwjyXvW/TzgkLAx2AGHwQ==",
"dev": true,
"license": "Apache-2.0",
"peer": true,
"bin": {
"tsc": "bin/tsc",
"tsserver": "bin/tsserver"