fix: better text extraction

nearlynithin
2025-11-07 14:19:44 +05:30
parent d7dbcff4b6
commit beadb7ce87
3 changed files with 178 additions and 101 deletions

View File

@@ -1,64 +1,145 @@
-import csv
+import csv, tempfile
 import re
 from pathlib import Path
 from collections import Counter
 from datetime import datetime
+from fastapi import FastAPI, File, UploadFile
+from fastapi.responses import FileResponse
+import uvicorn
+import pprint
-ROOT = Path("../Data/Politics")
+app = FastAPI()
+import spacy
+from spacy.matcher import PhraseMatcher, Matcher
+from collections import defaultdict
+import re
 # Try to import spaCy, fall back to basic extraction if not available
 try:
-    import spacy
+    nlp = spacy.load("en_core_web_trf")
+    USE_TRF = True
+except Exception:
     nlp = spacy.load("en_core_web_sm")
-    USE_SPACY = True
-except:
-    USE_SPACY = False
+    USE_TRF = False
-# Regex patterns for deterministic detection
-patterns = {
-    "EMAIL": re.compile(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+"),
-    "PHONE": re.compile(r"(?:\+?\d{1,3}[-.\s]?)?(?:\(?\d{2,4}\)?[-.\s]?)?\d{3,4}[-.\s]?\d{3,4}"),
-    "UUID": re.compile(r"\b[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}\b"),
-    "IBAN": re.compile(r"\b[A-Z]{2}\d{2}[A-Z0-9]{1,30}\b"),
-    "DATE": re.compile(r"\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b|\b\d{4}[/-]\d{1,2}[/-]\d{1,2}\b|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* \d{1,2},? \d{4}"),
-    "URL": re.compile(r"https?://[^\s]+"),
-    "SSN": re.compile(r"\b\d{3}-\d{2}-\d{4}\b"),
-}
+# small helper regexes for quick validation
+email_re = re.compile(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+")
+phone_re = re.compile(r"(?:\+?\d{1,3}[-.\s]?)?(?:\(?\d{2,4}\)?[-.\s]?)?\d{3,4}[-.\s]?\d{3,4}")
+short_token_re = re.compile(r"^[A-Za-z]{1,2}$")
-def find_entities(text):
-    """Extract entities using regex patterns."""
-    found = {}
-    for label, pattern in patterns.items():
-        matches = pattern.findall(text)
-        if matches:
-            found[label] = list(set(matches))[:5]  # Limit to 5 per type
-    return found
+# blacklist/whitelist samples (extend for your domain)
+BLACKLIST = set(["The", "In", "On", "And", "If", "But"])
+WHITELIST_TITLES = set(["Dr.", "Mr.", "Mrs.", "Ms.", "Prof."])
+# optional high-precision phrase matcher for domain terms (invoices etc.)
+phrase_matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
+for phrase in ["invoice number", "due date", "invoice", "total amount", "amount due", "customer name"]:
+    phrase_matcher.add("INVOICE_FIELD", [nlp.make_doc(phrase)])
+# generic matcher to capture patterns like "Name: John Doe"
+matcher = Matcher(nlp.vocab)
+matcher.add("KV_PATTERN", [
+    [{"IS_ALPHA": True, "OP": "+"}, {"IS_PUNCT": True, "OP": "?"}, {"IS_TITLE": True, "OP": "+"}],
+])
+def find_regex_entities(text):
+    emails = "; ".join(email_re.findall(text))
+    phones = "; ".join(phone_re.findall(text))
+    return {"EMAIL": emails, "PHONE": phones}
+# chunking to process long texts without losing context
+def chunk_text(text, max_chars=3000, overlap=200):
+    start = 0
+    L = len(text)
+    while start < L:
+        end = start + max_chars
+        if end >= L:
+            yield text[start:L]
+            break
+        # try to end at newline or space within a small window
+        cut = text.rfind("\n", start, end)
+        if cut <= start:
+            cut = text.rfind(" ", start, end)
+        if cut <= start:
+            cut = end
+        yield text[start:cut]
+        start = max(cut - overlap, cut)  # overlap for context
+def is_likely_name(ent_text):
+    # filter out very short tokens and blacklisted tokens
+    if short_token_re.match(ent_text):
+        return False
+    if any(tok.lower() in ("invoice", "total", "amount", "date", "http", "www") for tok in ent_text.split()):
+        return False
+    if ent_text.split()[0] in BLACKLIST:
+        return False
+    return True
+def add_entity(agg, ent_text, ctx):
+    rec = agg[ent_text]
+    rec["count"] += 1
+    if len(rec["examples"]) < 3:
+        rec["examples"].append(ctx)
 def extract_with_spacy(text):
     """Extract named entities using spaCy."""
-    if not USE_SPACY:
-        return {}, {}, {}
+    persons = defaultdict(lambda: {"count": 0, "examples": []})
+    orgs = defaultdict(lambda: {"count": 0, "examples": []})
+    locs = defaultdict(lambda: {"count": 0, "examples": []})
-    doc = nlp(text[:10000])  # Limit text length for performance
+    for chunk in chunk_text(text):
+        doc = nlp(chunk)
+        for _, start, end in phrase_matcher(doc):
+            span = doc[start:end]
+        for match_id, start, end in matcher(doc):
+            span = doc[start:end]
+            # not necessarily an entity, but may give context
-    persons = []
-    orgs = []
-    locations = []
+        for ent in doc.ents:
+            text_ent = ent.text.strip()
+            label = ent.label_
-    for ent in doc.ents:
-        if ent.label_ == "PERSON":
-            persons.append(ent.text)
-        elif ent.label_ == "ORG":
-            orgs.append(ent.text)
-        elif ent.label_ in ["GPE", "LOC"]:
-            locations.append(ent.text)
+            # basic filtering rules
+            if len(text_ent) < 2:
+                continue
+            if text_ent in BLACKLIST:
+                continue
-    # Return most common entities
-    return (
-        dict(Counter(persons).most_common(5)),
-        dict(Counter(orgs).most_common(5)),
-        dict(Counter(locations).most_common(5))
-    )
+            # context snippet for examples (trim)
+            sent_ctx = ent.sent.text.strip()
+            if len(sent_ctx) > 200:
+                sent_ctx = sent_ctx[:200] + "..."
+            # label mapping - adapt to what spaCy model returns
+            if label in ("PERSON", "PER", "PERSONS"):
+                if is_likely_name(text_ent) or any(t in WHITELIST_TITLES for t in text_ent.split()):
+                    add_entity(persons, text_ent, sent_ctx)
+            elif label in ("ORG", "ORGANIZATION", "COMPANY"):
+                add_entity(orgs, text_ent, sent_ctx)
+            elif label in ("GPE", "LOC", "LOCATION", "CITY", "COUNTRY"):
+                add_entity(locs, text_ent, sent_ctx)
+            else:
+                pass
+        if USE_TRF:
+            for ent in doc.ents:
+                try:
+                    vec_norms = [t.vector_norm for t in ent]
+                    avg = sum(vec_norms) / max(len(vec_norms), 1)
+                    # if avg very small it's likely low-quality
+                    if avg < 5.0:
+                        # treat low-norm ent as lower confidence, optionally skip
+                        continue
+                except Exception:
+                    pass
+    def finalize(d):
+        out = {}
+        for k, v in d.items():
+            out[k] = {"count": v["count"], "examples": v["examples"]}
+        return out
+    return finalize(persons), finalize(orgs), finalize(locs)
 def extract_metadata(text, filename):
     """Extract basic metadata from text."""
@@ -96,72 +177,60 @@ fieldnames = [
"persons", "organizations", "locations"
]
print("Processing files...")
with open("discovery_dataset.csv", "w", newline="", encoding="utf-8") as csvfile:
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
writer.writeheader()
file_count = 0
for path in ROOT.rglob("*"):
if not path.is_file():
continue
# this route will accept every txt/log file that is not csv
@app.post("/api/files")
async def postFile(file: UploadFile):
if file.filename.endswith("csv"):
return {"error" : "Cannot accept csv files"}
# Skip binary files
if path.suffix.lower() in ['.exe', '.dll', '.so', '.dylib', '.bin', '.jpg', '.png', '.gif', '.pdf']:
continue
try:
contents = await file.read()
text = contents.decode("utf-8", errors="ignore")
except Exception as e:
return {"error": f"Could not read file: {e}"}
try:
text = path.read_text(encoding="utf-8", errors="ignore")
except Exception as e:
print(f" Error reading {path.name}: {e}")
continue
# Skip empty uploads
if not text.strip():
return {"error": "File is empty or unreadable"}
if not text.strip():
continue
with tempfile.NamedTemporaryFile(delete=False, suffix=".csv", mode="w", newline="", encoding="utf-8") as csvfile:
print("Processing files...")
temp_path = Path(csvfile.name)
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
writer.writeheader()
file_count += 1
if file_count % 10 == 0:
print(f"Processed {file_count} files...")
row = {"filename" : file.filename}
# Initialize row
row = {"filename": str(path.relative_to(ROOT.parent))}
# Extract metadata
metadata = extract_metadata(text, path.name)
metadata = extract_metadata(text, row["filename"])
row.update(metadata)
# Detect content type
row["content_type"] = detect_content_type(text)
row["text_preview"] = text[:500].replace('\n', ' ').replace('\r', ' ')
# Extract entities with regex
entities = find_entities(text)
for key, values in entities.items():
row[key] = "; ".join(values) if values else ""
# Fill in missing pattern fields
for pattern_key in ["EMAIL", "PHONE", "UUID", "IBAN", "DATE", "URL", "SSN"]:
if pattern_key not in row:
row[pattern_key] = ""
# Extract named entities with spaCy
if USE_SPACY:
persons, orgs, locs = extract_with_spacy(text)
row["persons"] = "; ".join([f"{k}({v})" for k, v in persons.items()])
row["organizations"] = "; ".join([f"{k}({v})" for k, v in orgs.items()])
row["locations"] = "; ".join([f"{k}({v})" for k, v in locs.items()])
else:
row["persons"] = ""
row["organizations"] = ""
row["locations"] = ""
persons, orgs, locs = extract_with_spacy(text)
regex_entities = find_regex_entities(text)
row.update(regex_entities)
row["persons"] = "; ".join([f"{k} ({v['count']})" for k, v in persons.items()])
row["organizations"] = "; ".join([f"{k} ({v['count']})" for k, v in orgs.items()])
row["locations"] = "; ".join([f"{k} ({v['count']})" for k, v in locs.items()])
writer.writerow(row)
print(pprint.pprint(row))
print(f"\nComplete! Processed {file_count} files.")
print(f"Output: discovery_dataset.csv")
return FileResponse(
temp_path, media_type="text/csv", filename="dataset.csv"
)
# Print summary statistics
if file_count > 0:
print("\nTo install spaCy for better entity extraction:")
print(" pip install spacy")
print(" python -m spacy download en_core_web_sm")
if __name__ == '__main__':
uvicorn.run(app)
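A quick way to exercise the new /api/files route is a small client script. This is only a sketch, assuming the app is served locally on uvicorn's default port 8000; the notes.txt and dataset.csv filenames are placeholders.

import requests

# Upload a plain-text file to the route added in this commit; the multipart
# field must be named "file" to match the UploadFile parameter.
with open("notes.txt", "rb") as f:
    resp = requests.post(
        "http://127.0.0.1:8000/api/files",
        files={"file": ("notes.txt", f, "text/plain")},
    )

# The route answers with a CSV (FileResponse); save it locally.
with open("dataset.csv", "wb") as out:
    out.write(resp.content)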

View File

@@ -1,7 +1,6 @@
"use client";
import { TryTab } from "./Sidebar";
import { useState, useRef, useCallback, useEffect } from "react";
import { saveLatestUpload, getLatestUpload, deleteLatestUpload } from "../../lib/idb";
interface CenterPanelProps {
tab: TryTab;

View File

@@ -1001,6 +1001,7 @@
"integrity": "sha512-NZyJarBfL7nWwIq+FDL6Zp/yHEhePMNnnJ0y3qfieCrmNvYct8uvtiV41UvlSe6apAfk0fY1FbWx+NwfmpvtTg==",
"dev": true,
"license": "MIT",
"peer": true,
"bin": {
"acorn": "bin/acorn"
},
@@ -1441,6 +1442,7 @@
}
],
"license": "MIT",
"peer": true,
"dependencies": {
"baseline-browser-mapping": "^2.8.19",
"caniuse-lite": "^1.0.30001751",
@@ -2103,6 +2105,7 @@
"deprecated": "This version is no longer supported. Please see https://eslint.org/version-support for other options.",
"dev": true,
"license": "MIT",
"peer": true,
"dependencies": {
"@eslint-community/eslint-utils": "^4.2.0",
"@eslint-community/regexpp": "^4.6.1",
@@ -2271,6 +2274,7 @@
"integrity": "sha512-whOE1HFo/qJDyX4SnXzP4N6zOWn79WhnCUY/iDR0mPfQZO8wcYE4JClzI2oZrhBnnMUCBCHZhO6VQyoBU95mZA==",
"dev": true,
"license": "MIT",
"peer": true,
"dependencies": {
"@rtsao/scc": "^1.1.0",
"array-includes": "^3.1.9",
@@ -4423,6 +4427,7 @@
}
],
"license": "MIT",
"peer": true,
"dependencies": {
"nanoid": "^3.3.7",
"picocolors": "^1.0.0",
@@ -4581,6 +4586,7 @@
"resolved": "https://registry.npmjs.org/react/-/react-18.2.0.tgz",
"integrity": "sha512-/3IjMdb2L9QbBdWiW5e3P2/npwMBaU9mHCSCUzNln0ZCYbcfTsGbTJrU/kGemdH2IWmB2ioZ+zkxtmq6g09fGQ==",
"license": "MIT",
"peer": true,
"dependencies": {
"loose-envify": "^1.1.0"
},
@@ -4593,6 +4599,7 @@
"resolved": "https://registry.npmjs.org/react-dom/-/react-dom-18.2.0.tgz",
"integrity": "sha512-6IMTriUmvsjHUjNtEDudZfuDQUoWXVxKHhlEGSk81n4YFS+r/Kl99wXiwlVXtPBtJenozv2P+hxDsw9eA7Xo6g==",
"license": "MIT",
"peer": true,
"dependencies": {
"loose-envify": "^1.1.0",
"scheduler": "^0.23.0"
@@ -5539,6 +5546,7 @@
"integrity": "sha512-5gTmgEY/sqK6gFXLIsQNH19lWb4ebPDLA4SdLP7dsWkIXHWlG66oPuVvXSGFPppYZz8ZDZq0dYYrbHfBCVUb1Q==",
"dev": true,
"license": "MIT",
"peer": true,
"engines": {
"node": ">=12"
},
@@ -5708,6 +5716,7 @@
"integrity": "sha512-vcI4UpRgg81oIRUFwR0WSIHKt11nJ7SAVlYNIu+QpqeyXP+gpQJy/Z4+F0aGxSE4MqwjyXvW/TzgkLAx2AGHwQ==",
"dev": true,
"license": "Apache-2.0",
"peer": true,
"bin": {
"tsc": "bin/tsc",
"tsserver": "bin/tsserver"