Mirror of https://github.com/PlatypusPus/MushroomEmpire.git, synced 2026-02-07 22:18:59 +00:00
fix: better text extraction
@@ -1,64 +1,145 @@
-import csv
+import csv, tempfile
 import re
 from pathlib import Path
 from collections import Counter
 from datetime import datetime
+from fastapi import FastAPI, File, UploadFile
+from fastapi.responses import FileResponse
+import uvicorn
+import pprint

-ROOT = Path("../Data/Politics")
+app = FastAPI()

-# Try to import spaCy, fall back to basic extraction if not available
-try:
 import spacy
+from spacy.matcher import PhraseMatcher, Matcher
+from collections import defaultdict
+import re

+try:
+    nlp = spacy.load("en_core_web_trf")
+    USE_TRF = True
+except Exception:
     nlp = spacy.load("en_core_web_sm")
-    USE_SPACY = True
-except:
-    USE_SPACY = False
+    USE_TRF = False

-# Regex patterns for deterministic detection
-patterns = {
-    "EMAIL": re.compile(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+"),
-    "PHONE": re.compile(r"(?:\+?\d{1,3}[-.\s]?)?(?:\(?\d{2,4}\)?[-.\s]?)?\d{3,4}[-.\s]?\d{3,4}"),
-    "UUID": re.compile(r"\b[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}\b"),
-    "IBAN": re.compile(r"\b[A-Z]{2}\d{2}[A-Z0-9]{1,30}\b"),
-    "DATE": re.compile(r"\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b|\b\d{4}[/-]\d{1,2}[/-]\d{1,2}\b|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* \d{1,2},? \d{4}"),
-    "URL": re.compile(r"https?://[^\s]+"),
-    "SSN": re.compile(r"\b\d{3}-\d{2}-\d{4}\b"),
-}
+# small helper regexes for quick validation
+email_re = re.compile(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+")
+phone_re = re.compile(r"(?:\+?\d{1,3}[-.\s]?)?(?:\(?\d{2,4}\)?[-.\s]?)?\d{3,4}[-.\s]?\d{3,4}")
+short_token_re = re.compile(r"^[A-Za-z]{1,2}$")

-def find_entities(text):
-    """Extract entities using regex patterns."""
-    found = {}
-    for label, pattern in patterns.items():
-        matches = pattern.findall(text)
-        if matches:
-            found[label] = list(set(matches))[:5]  # Limit to 5 per type
-    return found
+# blacklist/whitelist samples (extend for your domain)
+BLACKLIST = set(["The", "In", "On", "And", "If", "But"])
+WHITELIST_TITLES = set(["Dr.", "Mr.", "Mrs.", "Ms.", "Prof."])
+# optional high-precision phrase matcher for domain terms (invoices etc.)
+phrase_matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
+for phrase in ["invoice number", "due date", "invoice", "total amount", "amount due", "customer name"]:
+    phrase_matcher.add("INVOICE_FIELD", [nlp.make_doc(phrase)])

+# generic matcher to capture patterns like "Name: John Doe"
+matcher = Matcher(nlp.vocab)
+matcher.add("KV_PATTERN", [
+    [{"IS_ALPHA": True, "OP": "+"}, {"IS_PUNCT": True, "OP": "?"}, {"IS_TITLE": True, "OP": "+"}],
+])

+def find_regex_entities(text):
+    emails = "; ".join(email_re.findall(text))
+    phones = "; ".join(phone_re.findall(text))
+    return {"EMAIL": emails, "PHONE": phones}


+# chunking to process long texts without losing context
+def chunk_text(text, max_chars=3000, overlap=200):
+    start = 0
+    L = len(text)
+    while start < L:
+        end = start + max_chars
+        if end >= L:
+            yield text[start:L]
+            break
+        # try to end at newline or space within a small window
+        cut = text.rfind("\n", start, end)
+        if cut <= start:
+            cut = text.rfind(" ", start, end)
+        if cut <= start:
+            cut = end
+        yield text[start:cut]
+        start = max(cut - overlap, start + 1)  # step back so consecutive chunks overlap for context

+def is_likely_name(ent_text):
+    # filter out very short tokens and blacklisted tokens
+    if short_token_re.match(ent_text):
+        return False
+    if any(tok.lower() in ("invoice", "total", "amount", "date", "http", "www") for tok in ent_text.split()):
+        return False
+    if ent_text.split()[0] in BLACKLIST:
+        return False
+    return True

+def add_entity(agg, ent_text, ctx):
+    rec = agg[ent_text]
+    rec["count"] += 1
+    if len(rec["examples"]) < 3:
+        rec["examples"].append(ctx)

 def extract_with_spacy(text):
-    """Extract named entities using spaCy."""
-    if not USE_SPACY:
-        return {}, {}, {}
+    persons = defaultdict(lambda: {"count": 0, "examples": []})
+    orgs = defaultdict(lambda: {"count": 0, "examples": []})
+    locs = defaultdict(lambda: {"count": 0, "examples": []})

-    doc = nlp(text[:10000])  # Limit text length for performance
-    persons = []
-    orgs = []
-    locations = []
+    for chunk in chunk_text(text):
+        doc = nlp(chunk)
+        for _, start, end in phrase_matcher(doc):
+            span = doc[start:end]
+        for match_id, start, end in matcher(doc):
+            span = doc[start:end]
+            # not necessarily an entity, but may give context

         for ent in doc.ents:
-        if ent.label_ == "PERSON":
-            persons.append(ent.text)
-        elif ent.label_ == "ORG":
-            orgs.append(ent.text)
-        elif ent.label_ in ["GPE", "LOC"]:
-            locations.append(ent.text)
+            text_ent = ent.text.strip()
+            label = ent.label_

-    # Return most common entities
-    return (
-        dict(Counter(persons).most_common(5)),
-        dict(Counter(orgs).most_common(5)),
-        dict(Counter(locations).most_common(5))
-    )
+            # basic filtering rules
+            if len(text_ent) < 2:
+                continue
+            if text_ent in BLACKLIST:
+                continue
+            # context snippet for examples (trim)
+            sent_ctx = ent.sent.text.strip()
+            if len(sent_ctx) > 200:
+                sent_ctx = sent_ctx[:200] + "..."

+            # label mapping - adapt to what spaCy model returns
+            if label in ("PERSON", "PER", "PERSONS"):
+                if is_likely_name(text_ent) or any(t in WHITELIST_TITLES for t in text_ent.split()):
+                    add_entity(persons, text_ent, sent_ctx)
+            elif label in ("ORG", "ORGANIZATION", "COMPANY"):
+                add_entity(orgs, text_ent, sent_ctx)
+            elif label in ("GPE", "LOC", "LOCATION", "CITY", "COUNTRY"):
+                add_entity(locs, text_ent, sent_ctx)
+            else:
+                pass

+        if USE_TRF:
+            for ent in doc.ents:
+                try:
+                    vec_norms = [t.vector_norm for t in ent]
+                    avg = sum(vec_norms) / max(len(vec_norms), 1)
+                    # if avg very small it's likely low-quality
+                    if avg < 5.0:
+                        # treat low-norm ent as lower confidence, optionally skip
+                        continue
+                except Exception:
+                    pass

+    def finalize(d):
+        out = {}
+        for k, v in d.items():
+            out[k] = {"count": v["count"], "examples": v["examples"]}
+        return out

+    return finalize(persons), finalize(orgs), finalize(locs)

 def extract_metadata(text, filename):
     """Extract basic metadata from text."""
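
A quick way to sanity-check the new chunking and regex helpers above: a minimal sketch, assuming the edited module is importable as main (its real filename is not shown in this diff) and that spaCy plus one of the two models is installed so the module-level load succeeds.

    # hypothetical module name; adjust to wherever this file actually lives
    from main import chunk_text, find_regex_entities

    sample = "Contact Dr. Jane Doe at jane.doe@example.com or +46 70 123 4567.\n" * 200

    # chunks should end on newline/space boundaries and overlap slightly
    chunks = list(chunk_text(sample, max_chars=500, overlap=50))
    print(len(chunks), [len(c) for c in chunks[:3]])

    # the regex pass is independent of spaCy and returns "; "-joined strings
    print(find_regex_entities(sample[:200]))
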
@@ -96,72 +177,60 @@ fieldnames = [
     "persons", "organizations", "locations"
 ]


+# this route will accept every txt/log file that is not csv
+@app.post("/api/files")
+async def postFile(file: UploadFile):
+    if file.filename.endswith("csv"):
+        return {"error": "Cannot accept csv files"}

+    try:
+        contents = await file.read()
+        text = contents.decode("utf-8", errors="ignore")
+    except Exception as e:
+        return {"error": f"Could not read file: {e}"}

+    # Skip empty uploads
+    if not text.strip():
+        return {"error": "File is empty or unreadable"}

+    with tempfile.NamedTemporaryFile(delete=False, suffix=".csv", mode="w", newline="", encoding="utf-8") as csvfile:
         print("Processing files...")
-with open("discovery_dataset.csv", "w", newline="", encoding="utf-8") as csvfile:
+        temp_path = Path(csvfile.name)
         writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
         writer.writeheader()

-    file_count = 0
-    for path in ROOT.rglob("*"):
-        if not path.is_file():
-            continue
+        row = {"filename": file.filename}

-        # Skip binary files
-        if path.suffix.lower() in ['.exe', '.dll', '.so', '.dylib', '.bin', '.jpg', '.png', '.gif', '.pdf']:
-            continue

-        try:
-            text = path.read_text(encoding="utf-8", errors="ignore")
-        except Exception as e:
-            print(f" Error reading {path.name}: {e}")
-            continue

-        if not text.strip():
-            continue

-        file_count += 1
-        if file_count % 10 == 0:
-            print(f"Processed {file_count} files...")

-        # Initialize row
-        row = {"filename": str(path.relative_to(ROOT.parent))}

         # Extract metadata
-        metadata = extract_metadata(text, path.name)
+        metadata = extract_metadata(text, row["filename"])
         row.update(metadata)

         # Detect content type
         row["content_type"] = detect_content_type(text)
         row["text_preview"] = text[:500].replace('\n', ' ').replace('\r', ' ')

-        # Extract entities with regex
-        entities = find_entities(text)
-        for key, values in entities.items():
-            row[key] = "; ".join(values) if values else ""

         # Fill in missing pattern fields
         for pattern_key in ["EMAIL", "PHONE", "UUID", "IBAN", "DATE", "URL", "SSN"]:
             if pattern_key not in row:
                 row[pattern_key] = ""

-        # Extract named entities with spaCy
-        if USE_SPACY:
         persons, orgs, locs = extract_with_spacy(text)
-            row["persons"] = "; ".join([f"{k}({v})" for k, v in persons.items()])
-            row["organizations"] = "; ".join([f"{k}({v})" for k, v in orgs.items()])
-            row["locations"] = "; ".join([f"{k}({v})" for k, v in locs.items()])
-        else:
-            row["persons"] = ""
-            row["organizations"] = ""
-            row["locations"] = ""
+        regex_entities = find_regex_entities(text)
+        row.update(regex_entities)
+        row["persons"] = "; ".join([f"{k} ({v['count']})" for k, v in persons.items()])
+        row["organizations"] = "; ".join([f"{k} ({v['count']})" for k, v in orgs.items()])
+        row["locations"] = "; ".join([f"{k} ({v['count']})" for k, v in locs.items()])

         writer.writerow(row)
+        pprint.pprint(row)

-print(f"\nComplete! Processed {file_count} files.")
-print(f"Output: discovery_dataset.csv")
+    return FileResponse(
+        temp_path, media_type="text/csv", filename="dataset.csv"
+    )

-# Print summary statistics
-if file_count > 0:
-    print("\nTo install spaCy for better entity extraction:")
-    print(" pip install spacy")
-    print(" python -m spacy download en_core_web_sm")
+if __name__ == '__main__':
+    uvicorn.run(app)
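
A minimal sketch of calling the new /api/files route, assuming the app is served locally via uvicorn on the default port 8000 and that the requests package is installed; notes.txt stands in for any plain-text upload.

    import requests

    # POST a text/log file; the endpoint answers with a CSV attachment on success
    with open("notes.txt", "rb") as fh:
        resp = requests.post(
            "http://127.0.0.1:8000/api/files",
            files={"file": ("notes.txt", fh, "text/plain")},
        )

    # error cases (csv upload, empty or unreadable file) come back as JSON instead
    if resp.headers.get("content-type", "").startswith("text/csv"):
        with open("dataset.csv", "wb") as out:
            out.write(resp.content)
    else:
        print(resp.json())
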
@@ -1,7 +1,6 @@
 "use client";
 import { TryTab } from "./Sidebar";
 import { useState, useRef, useCallback, useEffect } from "react";
-import { saveLatestUpload, getLatestUpload, deleteLatestUpload } from "../../lib/idb";

 interface CenterPanelProps {
   tab: TryTab;
frontend/nordic-privacy-ai/package-lock.json (generated, 9 changed lines)
@@ -1001,6 +1001,7 @@
       "integrity": "sha512-NZyJarBfL7nWwIq+FDL6Zp/yHEhePMNnnJ0y3qfieCrmNvYct8uvtiV41UvlSe6apAfk0fY1FbWx+NwfmpvtTg==",
       "dev": true,
       "license": "MIT",
+      "peer": true,
       "bin": {
         "acorn": "bin/acorn"
       },
@@ -1441,6 +1442,7 @@
         }
       ],
       "license": "MIT",
+      "peer": true,
       "dependencies": {
         "baseline-browser-mapping": "^2.8.19",
         "caniuse-lite": "^1.0.30001751",
@@ -2103,6 +2105,7 @@
       "deprecated": "This version is no longer supported. Please see https://eslint.org/version-support for other options.",
       "dev": true,
       "license": "MIT",
+      "peer": true,
       "dependencies": {
         "@eslint-community/eslint-utils": "^4.2.0",
         "@eslint-community/regexpp": "^4.6.1",
@@ -2271,6 +2274,7 @@
       "integrity": "sha512-whOE1HFo/qJDyX4SnXzP4N6zOWn79WhnCUY/iDR0mPfQZO8wcYE4JClzI2oZrhBnnMUCBCHZhO6VQyoBU95mZA==",
       "dev": true,
       "license": "MIT",
+      "peer": true,
       "dependencies": {
         "@rtsao/scc": "^1.1.0",
         "array-includes": "^3.1.9",
@@ -4423,6 +4427,7 @@
         }
       ],
       "license": "MIT",
+      "peer": true,
       "dependencies": {
         "nanoid": "^3.3.7",
         "picocolors": "^1.0.0",
@@ -4581,6 +4586,7 @@
       "resolved": "https://registry.npmjs.org/react/-/react-18.2.0.tgz",
       "integrity": "sha512-/3IjMdb2L9QbBdWiW5e3P2/npwMBaU9mHCSCUzNln0ZCYbcfTsGbTJrU/kGemdH2IWmB2ioZ+zkxtmq6g09fGQ==",
       "license": "MIT",
+      "peer": true,
       "dependencies": {
         "loose-envify": "^1.1.0"
       },
@@ -4593,6 +4599,7 @@
       "resolved": "https://registry.npmjs.org/react-dom/-/react-dom-18.2.0.tgz",
       "integrity": "sha512-6IMTriUmvsjHUjNtEDudZfuDQUoWXVxKHhlEGSk81n4YFS+r/Kl99wXiwlVXtPBtJenozv2P+hxDsw9eA7Xo6g==",
       "license": "MIT",
+      "peer": true,
       "dependencies": {
         "loose-envify": "^1.1.0",
         "scheduler": "^0.23.0"
@@ -5539,6 +5546,7 @@
       "integrity": "sha512-5gTmgEY/sqK6gFXLIsQNH19lWb4ebPDLA4SdLP7dsWkIXHWlG66oPuVvXSGFPppYZz8ZDZq0dYYrbHfBCVUb1Q==",
       "dev": true,
       "license": "MIT",
+      "peer": true,
       "engines": {
         "node": ">=12"
       },
@@ -5708,6 +5716,7 @@
       "integrity": "sha512-vcI4UpRgg81oIRUFwR0WSIHKt11nJ7SAVlYNIu+QpqeyXP+gpQJy/Z4+F0aGxSE4MqwjyXvW/TzgkLAx2AGHwQ==",
       "dev": true,
       "license": "Apache-2.0",
+      "peer": true,
       "bin": {
         "tsc": "bin/tsc",
         "tsserver": "bin/tsserver"