From beadb7ce87750642f0837079a17f8c96ee70954e Mon Sep 17 00:00:00 2001
From: nearlynithin
Date: Fri, 7 Nov 2025 14:19:44 +0530
Subject: [PATCH] fix: better text extraction

---
 discovery/main.py                             | 269 +++++++++++-------
 .../components/try/CenterPanel.tsx            |   1 -
 frontend/nordic-privacy-ai/package-lock.json  |   9 +
 3 files changed, 178 insertions(+), 101 deletions(-)

diff --git a/discovery/main.py b/discovery/main.py
index 96fa8e7..789c997 100644
--- a/discovery/main.py
+++ b/discovery/main.py
@@ -1,64 +1,145 @@
-import csv
+import csv, tempfile
 import re
 from pathlib import Path
 from collections import Counter
 from datetime import datetime
+from fastapi import FastAPI, UploadFile
+from fastapi.responses import FileResponse
+from starlette.background import BackgroundTask
+import uvicorn
+import pprint
 
-ROOT = Path("../Data/Politics")
+import spacy
+from spacy.matcher import PhraseMatcher, Matcher
+from collections import defaultdict
+
+app = FastAPI()
 
-# Try to import spaCy, fall back to basic extraction if not available
+# prefer the transformer pipeline for accuracy; fall back to the small model
 try:
-    import spacy
+    nlp = spacy.load("en_core_web_trf")
+    USE_TRF = True
+except Exception:
     nlp = spacy.load("en_core_web_sm")
-    USE_SPACY = True
-except:
-    USE_SPACY = False
+    USE_TRF = False
 
-# Regex patterns for deterministic detection
-patterns = {
-    "EMAIL": re.compile(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+"),
-    "PHONE": re.compile(r"(?:\+?\d{1,3}[-.\s]?)?(?:\(?\d{2,4}\)?[-.\s]?)?\d{3,4}[-.\s]?\d{3,4}"),
-    "UUID": re.compile(r"\b[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}\b"),
-    "IBAN": re.compile(r"\b[A-Z]{2}\d{2}[A-Z0-9]{1,30}\b"),
-    "DATE": re.compile(r"\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b|\b\d{4}[/-]\d{1,2}[/-]\d{1,2}\b|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* \d{1,2},? \d{4}"),
-    "URL": re.compile(r"https?://[^\s]+"),
-    "SSN": re.compile(r"\b\d{3}-\d{2}-\d{4}\b"),
-}
+# small helper regexes for quick validation
+email_re = re.compile(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+")
+# note: deliberately loose; this will also match other digit runs such as dates
+phone_re = re.compile(r"(?:\+?\d{1,3}[-.\s]?)?(?:\(?\d{2,4}\)?[-.\s]?)?\d{3,4}[-.\s]?\d{3,4}")
+short_token_re = re.compile(r"^[A-Za-z]{1,2}$")
 
-def find_entities(text):
-    """Extract entities using regex patterns."""
-    found = {}
-    for label, pattern in patterns.items():
-        matches = pattern.findall(text)
-        if matches:
-            found[label] = list(set(matches))[:5]  # Limit to 5 per type
-    return found
+# blacklist/whitelist samples (extend for your domain)
+BLACKLIST = {"The", "In", "On", "And", "If", "But"}
+WHITELIST_TITLES = {"Dr.", "Mr.", "Mrs.", "Ms.", "Prof."}
+
+# optional high-precision phrase matcher for domain terms (invoices etc.)
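+# (PhraseMatcher with attr="LOWER" compares lowercased token text, so the
+# phrases below match case-insensitively; extend the list with your own
+# domain vocabulary as needed)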
+phrase_matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
+for phrase in ["invoice number", "due date", "invoice", "total amount", "amount due", "customer name"]:
+    phrase_matcher.add("INVOICE_FIELD", [nlp.make_doc(phrase)])
+
+# generic matcher to capture patterns like "Name: John Doe"
+matcher = Matcher(nlp.vocab)
+matcher.add("KV_PATTERN", [
+    [{"IS_ALPHA": True, "OP": "+"}, {"IS_PUNCT": True, "OP": "?"}, {"IS_TITLE": True, "OP": "+"}],
+])
+
+def find_regex_entities(text):
+    # dict.fromkeys dedupes while preserving first-seen order
+    emails = "; ".join(dict.fromkeys(email_re.findall(text)))
+    phones = "; ".join(dict.fromkeys(phone_re.findall(text)))
+    return {"EMAIL": emails, "PHONE": phones}
+
+
+# chunking to process long texts without losing context
+def chunk_text(text, max_chars=3000, overlap=200):
+    start = 0
+    L = len(text)
+    while start < L:
+        end = start + max_chars
+        if end >= L:
+            yield text[start:L]
+            break
+        # try to end at a newline or space within the window
+        cut = text.rfind("\n", start, end)
+        if cut <= start:
+            cut = text.rfind(" ", start, end)
+        if cut <= start:
+            cut = end
+        yield text[start:cut]
+        # step back by `overlap` chars for context, but always make progress
+        start = max(cut - overlap, start + 1)
+
+def is_likely_name(ent_text):
+    # filter out very short tokens and blacklisted tokens
+    if short_token_re.match(ent_text):
+        return False
+    if any(tok.lower() in ("invoice", "total", "amount", "date", "http", "www") for tok in ent_text.split()):
+        return False
+    if ent_text.split()[0] in BLACKLIST:
+        return False
+    return True
+
+def add_entity(agg, ent_text, ctx):
+    rec = agg[ent_text]
+    rec["count"] += 1
+    if len(rec["examples"]) < 3:
+        rec["examples"].append(ctx)
 
 def extract_with_spacy(text):
-    """Extract named entities using spaCy."""
-    if not USE_SPACY:
-        return {}, {}, {}
-
-    doc = nlp(text[:10000])  # Limit text length for performance
-
-    persons = []
-    orgs = []
-    locations = []
-
-    for ent in doc.ents:
-        if ent.label_ == "PERSON":
-            persons.append(ent.text)
-        elif ent.label_ == "ORG":
-            orgs.append(ent.text)
-        elif ent.label_ in ["GPE", "LOC"]:
-            locations.append(ent.text)
-
-    # Return most common entities
-    return (
-        dict(Counter(persons).most_common(5)),
-        dict(Counter(orgs).most_common(5)),
-        dict(Counter(locations).most_common(5))
-    )
+    persons = defaultdict(lambda: {"count": 0, "examples": []})
+    orgs = defaultdict(lambda: {"count": 0, "examples": []})
+    locs = defaultdict(lambda: {"count": 0, "examples": []})
+
+    for chunk in chunk_text(text):
+        doc = nlp(chunk)
+        # run each matcher once per chunk; the spans are not consumed yet,
+        # but they can feed future context heuristics
+        phrase_spans = [doc[s:e] for _, s, e in phrase_matcher(doc)]
+        kv_spans = [doc[s:e] for _, s, e in matcher(doc)]
+
+        for ent in doc.ents:
+            text_ent = ent.text.strip()
+            label = ent.label_
+
+            # basic filtering rules
+            if len(text_ent) < 2:
+                continue
+            if text_ent in BLACKLIST:
+                continue
+
+            # context snippet for examples (trim)
+            sent_ctx = ent.sent.text.strip()
+            if len(sent_ctx) > 200:
+                sent_ctx = sent_ctx[:200] + "..."
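+            # (ent.sent is safe here: both pipelines loaded above include a
+            # parser, which supplies the sentence boundaries it needs)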
+
+            # optional confidence heuristic: skip entities whose average token
+            # vector norm is very low; applied only when the pipeline actually
+            # exposes static vectors (transformer pipelines often do not)
+            if USE_TRF:
+                norms = [t.vector_norm for t in ent]
+                if norms and max(norms) > 0 and sum(norms) / len(norms) < 5.0:
+                    continue
+
+            # label mapping - adapt to whatever label set the model returns;
+            # labels outside these three groups are ignored
+            if label in ("PERSON", "PER", "PERSONS"):
+                if is_likely_name(text_ent) or any(t in WHITELIST_TITLES for t in text_ent.split()):
+                    add_entity(persons, text_ent, sent_ctx)
+            elif label in ("ORG", "ORGANIZATION", "COMPANY"):
+                add_entity(orgs, text_ent, sent_ctx)
+            elif label in ("GPE", "LOC", "LOCATION", "CITY", "COUNTRY"):
+                add_entity(locs, text_ent, sent_ctx)
+
+    def finalize(d):
+        out = {}
+        for k, v in d.items():
+            out[k] = {"count": v["count"], "examples": v["examples"]}
+        return out
+
+    return finalize(persons), finalize(orgs), finalize(locs)
 
 def extract_metadata(text, filename):
     """Extract basic metadata from text."""
@@ -96,72 +177,60 @@ fieldnames = [
     "persons", "organizations", "locations"
 ]
 
-print("Processing files...")
-with open("discovery_dataset.csv", "w", newline="", encoding="utf-8") as csvfile:
-    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
-    writer.writeheader()
-
-    file_count = 0
-    for path in ROOT.rglob("*"):
-        if not path.is_file():
-            continue
-
-        # Skip binary files
-        if path.suffix.lower() in ['.exe', '.dll', '.so', '.dylib', '.bin', '.jpg', '.png', '.gif', '.pdf']:
-            continue
-
-        try:
-            text = path.read_text(encoding="utf-8", errors="ignore")
-        except Exception as e:
-            print(f"  Error reading {path.name}: {e}")
-            continue
-
-        if not text.strip():
-            continue
-
-        file_count += 1
-        if file_count % 10 == 0:
-            print(f"Processed {file_count} files...")
-
-        # Initialize row
-        row = {"filename": str(path.relative_to(ROOT.parent))}
+
+# accept any text-like upload (txt, log, ...); CSV files are rejected
+@app.post("/api/files")
+async def post_file(file: UploadFile):
+    if file.filename and file.filename.lower().endswith(".csv"):
+        return {"error": "Cannot accept CSV files"}
+
+    try:
+        contents = await file.read()
+        text = contents.decode("utf-8", errors="ignore")
+    except Exception as e:
+        return {"error": f"Could not read file: {e}"}
+
+    # Skip empty uploads
+    if not text.strip():
+        return {"error": "File is empty or unreadable"}
+
+    # delete=False so the file survives the `with` block; it is removed by the
+    # background task attached to the response below
+    with tempfile.NamedTemporaryFile(delete=False, suffix=".csv", mode="w", newline="", encoding="utf-8") as csvfile:
+        print(f"Processing {file.filename}...")
+        temp_path = Path(csvfile.name)
+        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
+        writer.writeheader()
+
+        row = {"filename": file.filename}
+
         # Extract metadata
-        metadata = extract_metadata(text, path.name)
+        metadata = extract_metadata(text, row["filename"])
         row.update(metadata)
 
         # Detect content type
         row["content_type"] = detect_content_type(text)
         row["text_preview"] = text[:500].replace('\n', ' ').replace('\r', ' ')
 
-        # Extract entities with regex
-        entities = find_entities(text)
-        for key, values in entities.items():
-            row[key] = "; ".join(values) if values else ""
         # Fill in missing pattern fields
         for pattern_key in ["EMAIL", "PHONE", "UUID", "IBAN", "DATE", "URL", "SSN"]:
             if pattern_key not in row:
                 row[pattern_key] = ""
 
-        # Extract named entities with spaCy
-        if USE_SPACY:
-            persons, orgs, locs = extract_with_spacy(text)
-            row["persons"] = "; ".join([f"{k}({v})" for k, v in persons.items()])
-            row["organizations"] = "; ".join([f"{k}({v})" for k, v in orgs.items()])
-            row["locations"] = "; ".join([f"{k}({v})" for k, v in locs.items()])
-        else:
-            row["persons"] = ""
-            row["organizations"] = ""
-            row["locations"] = ""
+        persons, orgs, locs = extract_with_spacy(text)
+        regex_entities = find_regex_entities(text)
+        row.update(regex_entities)
+        row["persons"] = "; ".join([f"{k} ({v['count']})" for k, v in persons.items()])
+        row["organizations"] = "; ".join([f"{k} ({v['count']})" for k, v in orgs.items()])
+        row["locations"] = "; ".join([f"{k} ({v['count']})" for k, v in locs.items()])
+        writer.writerow(row)
+        # pprint returns None, so call it directly instead of printing its result
+        pprint.pprint(row)
 
-print(f"\nComplete! Processed {file_count} files.")
-print(f"Output: discovery_dataset.csv")
+    # stream the CSV back; the background task removes the temp file once the
+    # response has been sent
+    return FileResponse(
+        temp_path,
+        media_type="text/csv",
+        filename="dataset.csv",
+        background=BackgroundTask(temp_path.unlink),
+    )
 
-# Print summary statistics
-if file_count > 0:
-    print("\nTo install spaCy for better entity extraction:")
-    print(" pip install spacy")
-    print(" python -m spacy download en_core_web_sm")
\ No newline at end of file
+if __name__ == '__main__':
+    uvicorn.run(app)
\ No newline at end of file
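
A quick way to exercise the new endpoint by hand (a minimal client sketch, not
part of the patch; it assumes the server is running locally via uvicorn on the
default port, that a sample.txt exists, and that the requests package is
installed separately):

    import requests

    # the multipart field name must match the handler's `file` parameter
    with open("sample.txt", "rb") as f:
        resp = requests.post("http://127.0.0.1:8000/api/files", files={"file": f})

    # success returns CSV bytes; error cases come back as JSON instead
    with open("dataset.csv", "wb") as out:
        out.write(resp.content)
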
diff --git a/frontend/nordic-privacy-ai/components/try/CenterPanel.tsx b/frontend/nordic-privacy-ai/components/try/CenterPanel.tsx
index 73c8a2c..e8239c7 100644
--- a/frontend/nordic-privacy-ai/components/try/CenterPanel.tsx
+++ b/frontend/nordic-privacy-ai/components/try/CenterPanel.tsx
@@ -1,7 +1,6 @@
 "use client";
 import { TryTab } from "./Sidebar";
 import { useState, useRef, useCallback, useEffect } from "react";
-import { saveLatestUpload, getLatestUpload, deleteLatestUpload } from "../../lib/idb";
 
 interface CenterPanelProps {
   tab: TryTab;
diff --git a/frontend/nordic-privacy-ai/package-lock.json b/frontend/nordic-privacy-ai/package-lock.json
index 3f2a3d8..46f6cd8 100644
--- a/frontend/nordic-privacy-ai/package-lock.json
+++ b/frontend/nordic-privacy-ai/package-lock.json
@@ -1001,6 +1001,7 @@
       "integrity": "sha512-NZyJarBfL7nWwIq+FDL6Zp/yHEhePMNnnJ0y3qfieCrmNvYct8uvtiV41UvlSe6apAfk0fY1FbWx+NwfmpvtTg==",
       "dev": true,
       "license": "MIT",
+      "peer": true,
       "bin": {
         "acorn": "bin/acorn"
       },
@@ -1441,6 +1442,7 @@
         }
       ],
       "license": "MIT",
+      "peer": true,
       "dependencies": {
         "baseline-browser-mapping": "^2.8.19",
         "caniuse-lite": "^1.0.30001751",
@@ -2103,6 +2105,7 @@
       "deprecated": "This version is no longer supported. Please see https://eslint.org/version-support for other options.",
       "dev": true,
       "license": "MIT",
+      "peer": true,
       "dependencies": {
         "@eslint-community/eslint-utils": "^4.2.0",
         "@eslint-community/regexpp": "^4.6.1",
@@ -2271,6 +2274,7 @@
       "integrity": "sha512-whOE1HFo/qJDyX4SnXzP4N6zOWn79WhnCUY/iDR0mPfQZO8wcYE4JClzI2oZrhBnnMUCBCHZhO6VQyoBU95mZA==",
       "dev": true,
       "license": "MIT",
+      "peer": true,
       "dependencies": {
         "@rtsao/scc": "^1.1.0",
         "array-includes": "^3.1.9",
@@ -4423,6 +4427,7 @@
         }
       ],
       "license": "MIT",
+      "peer": true,
       "dependencies": {
         "nanoid": "^3.3.7",
         "picocolors": "^1.0.0",
@@ -4581,6 +4586,7 @@
       "resolved": "https://registry.npmjs.org/react/-/react-18.2.0.tgz",
       "integrity": "sha512-/3IjMdb2L9QbBdWiW5e3P2/npwMBaU9mHCSCUzNln0ZCYbcfTsGbTJrU/kGemdH2IWmB2ioZ+zkxtmq6g09fGQ==",
       "license": "MIT",
+      "peer": true,
       "dependencies": {
         "loose-envify": "^1.1.0"
       },
@@ -4593,6 +4599,7 @@
       "resolved": "https://registry.npmjs.org/react-dom/-/react-dom-18.2.0.tgz",
       "integrity": "sha512-6IMTriUmvsjHUjNtEDudZfuDQUoWXVxKHhlEGSk81n4YFS+r/Kl99wXiwlVXtPBtJenozv2P+hxDsw9eA7Xo6g==",
       "license": "MIT",
+      "peer": true,
       "dependencies": {
         "loose-envify": "^1.1.0",
         "scheduler": "^0.23.0"
@@ -5539,6 +5546,7 @@
       "integrity": "sha512-5gTmgEY/sqK6gFXLIsQNH19lWb4ebPDLA4SdLP7dsWkIXHWlG66oPuVvXSGFPppYZz8ZDZq0dYYrbHfBCVUb1Q==",
       "dev": true,
       "license": "MIT",
+      "peer": true,
       "engines": {
         "node": ">=12"
       },
@@ -5708,6 +5716,7 @@
       "integrity": "sha512-vcI4UpRgg81oIRUFwR0WSIHKt11nJ7SAVlYNIu+QpqeyXP+gpQJy/Z4+F0aGxSE4MqwjyXvW/TzgkLAx2AGHwQ==",
       "dev": true,
       "license": "Apache-2.0",
+      "peer": true,
       "bin": {
         "tsc": "bin/tsc",
         "tsserver": "bin/tsserver"