fix: better text extraction

nearlynithin
2025-11-07 14:19:44 +05:30
parent d7dbcff4b6
commit beadb7ce87
3 changed files with 178 additions and 101 deletions

View File

@@ -1,64 +1,145 @@
-import csv
+import csv, tempfile
 import re
 from pathlib import Path
 from collections import Counter
 from datetime import datetime
+from fastapi import FastAPI, File, UploadFile
+from fastapi.responses import FileResponse
+import uvicorn
+import pprint
 
-ROOT = Path("../Data/Politics")
+app = FastAPI()
+
+import spacy
+from spacy.matcher import PhraseMatcher, Matcher
+from collections import defaultdict
+import re
 
-# Try to import spaCy, fall back to basic extraction if not available
 try:
-    import spacy
-    nlp = spacy.load("en_core_web_sm")
-    USE_SPACY = True
-except:
-    USE_SPACY = False
+    nlp = spacy.load("en_core_web_trf")
+    USE_TRF = True
+except Exception:
+    nlp = spacy.load("en_core_web_sm")
+    USE_TRF = False
 
-# Regex patterns for deterministic detection
-patterns = {
-    "EMAIL": re.compile(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+"),
-    "PHONE": re.compile(r"(?:\+?\d{1,3}[-.\s]?)?(?:\(?\d{2,4}\)?[-.\s]?)?\d{3,4}[-.\s]?\d{3,4}"),
-    "UUID": re.compile(r"\b[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}\b"),
-    "IBAN": re.compile(r"\b[A-Z]{2}\d{2}[A-Z0-9]{1,30}\b"),
-    "DATE": re.compile(r"\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b|\b\d{4}[/-]\d{1,2}[/-]\d{1,2}\b|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* \d{1,2},? \d{4}"),
-    "URL": re.compile(r"https?://[^\s]+"),
-    "SSN": re.compile(r"\b\d{3}-\d{2}-\d{4}\b"),
-}
+# small helper regexes for quick validation
+email_re = re.compile(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+")
+phone_re = re.compile(r"(?:\+?\d{1,3}[-.\s]?)?(?:\(?\d{2,4}\)?[-.\s]?)?\d{3,4}[-.\s]?\d{3,4}")
+short_token_re = re.compile(r"^[A-Za-z]{1,2}$")
 
-def find_entities(text):
-    """Extract entities using regex patterns."""
-    found = {}
-    for label, pattern in patterns.items():
-        matches = pattern.findall(text)
-        if matches:
-            found[label] = list(set(matches))[:5]  # Limit to 5 per type
-    return found
+# blacklist/whitelist samples (extend for your domain)
+BLACKLIST = set(["The", "In", "On", "And", "If", "But"])
+WHITELIST_TITLES = set(["Dr.", "Mr.", "Mrs.", "Ms.", "Prof."])
+
+# optional high-precision phrase matcher for domain terms (invoices etc.)
+phrase_matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
+for phrase in ["invoice number", "due date", "invoice", "total amount", "amount due", "customer name"]:
+    phrase_matcher.add("INVOICE_FIELD", [nlp.make_doc(phrase)])
+
+# generic matcher to capture patterns like "Name: John Doe"
+matcher = Matcher(nlp.vocab)
+matcher.add("KV_PATTERN", [
+    [{"IS_ALPHA": True, "OP": "+"}, {"IS_PUNCT": True, "OP": "?"}, {"IS_TITLE": True, "OP": "+"}],
+])
+
+def find_regex_entities(text):
+    emails = "; ".join(email_re.findall(text))
+    phones = "; ".join(phone_re.findall(text))
+    return {"EMAIL": emails, "PHONE": phones}
+
+# chunking to process long texts without losing context
+def chunk_text(text, max_chars=3000, overlap=200):
+    start = 0
+    L = len(text)
+    while start < L:
+        end = start + max_chars
+        if end >= L:
+            yield text[start:L]
+            break
+        # try to end at newline or space within a small window
+        cut = text.rfind("\n", start, end)
+        if cut <= start:
+            cut = text.rfind(" ", start, end)
+        if cut <= start:
+            cut = end
+        yield text[start:cut]
+        start = max(cut - overlap, start + 1)  # step back by `overlap` for context, but always advance
+
+def is_likely_name(ent_text):
+    # filter out very short tokens and blacklisted tokens
+    if short_token_re.match(ent_text):
+        return False
+    if any(tok.lower() in ("invoice", "total", "amount", "date", "http", "www") for tok in ent_text.split()):
+        return False
+    if ent_text.split()[0] in BLACKLIST:
+        return False
+    return True
+
+def add_entity(agg, ent_text, ctx):
+    rec = agg[ent_text]
+    rec["count"] += 1
+    if len(rec["examples"]) < 3:
+        rec["examples"].append(ctx)
 
 def extract_with_spacy(text):
-    """Extract named entities using spaCy."""
-    if not USE_SPACY:
-        return {}, {}, {}
-
-    doc = nlp(text[:10000])  # Limit text length for performance
-
-    persons = []
-    orgs = []
-    locations = []
-
-    for ent in doc.ents:
-        if ent.label_ == "PERSON":
-            persons.append(ent.text)
-        elif ent.label_ == "ORG":
-            orgs.append(ent.text)
-        elif ent.label_ in ["GPE", "LOC"]:
-            locations.append(ent.text)
-
-    # Return most common entities
-    return (
-        dict(Counter(persons).most_common(5)),
-        dict(Counter(orgs).most_common(5)),
-        dict(Counter(locations).most_common(5))
-    )
+    persons = defaultdict(lambda: {"count": 0, "examples": []})
+    orgs = defaultdict(lambda: {"count": 0, "examples": []})
+    locs = defaultdict(lambda: {"count": 0, "examples": []})
+
+    for chunk in chunk_text(text):
+        doc = nlp(chunk)
+
+        # high-precision domain matches (collected but not yet used downstream)
+        for _, start, end in phrase_matcher(doc):
+            span = doc[start:end]
+        for match_id, start, end in matcher(doc):
+            span = doc[start:end]
+            # not necessarily an entity, but may give context
+
+        for ent in doc.ents:
+            text_ent = ent.text.strip()
+            label = ent.label_
+
+            # basic filtering rules
+            if len(text_ent) < 2:
+                continue
+            if text_ent in BLACKLIST:
+                continue
+
+            # context snippet for examples (trim)
+            sent_ctx = ent.sent.text.strip()
+            if len(sent_ctx) > 200:
+                sent_ctx = sent_ctx[:200] + "..."
+
+            # label mapping - adapt to what the spaCy model returns
+            if label in ("PERSON", "PER", "PERSONS"):
+                if is_likely_name(text_ent) or any(t in WHITELIST_TITLES for t in text_ent.split()):
+                    add_entity(persons, text_ent, sent_ctx)
+            elif label in ("ORG", "ORGANIZATION", "COMPANY"):
+                add_entity(orgs, text_ent, sent_ctx)
+            elif label in ("GPE", "LOC", "LOCATION", "CITY", "COUNTRY"):
+                add_entity(locs, text_ent, sent_ctx)
+
+        if USE_TRF:
+            # vector-norm sanity check; note this pass does not currently
+            # remove anything from the aggregates built above
+            for ent in doc.ents:
+                try:
+                    vec_norms = [t.vector_norm for t in ent]
+                    avg = sum(vec_norms) / max(len(vec_norms), 1)
+                    # if avg is very small the entity is likely low-quality
+                    if avg < 5.0:
+                        # treat low-norm ent as lower confidence, optionally skip
+                        continue
+                except Exception:
+                    pass
+
+    def finalize(d):
+        out = {}
+        for k, v in d.items():
+            out[k] = {"count": v["count"], "examples": v["examples"]}
+        return out
+
+    return finalize(persons), finalize(orgs), finalize(locs)
 
 def extract_metadata(text, filename):
     """Extract basic metadata from text."""
@@ -96,72 +177,60 @@ fieldnames = [
     "persons", "organizations", "locations"
 ]
 
-print("Processing files...")
-with open("discovery_dataset.csv", "w", newline="", encoding="utf-8") as csvfile:
-    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
-    writer.writeheader()
-
-    file_count = 0
-    for path in ROOT.rglob("*"):
-        if not path.is_file():
-            continue
-
-        # Skip binary files
-        if path.suffix.lower() in ['.exe', '.dll', '.so', '.dylib', '.bin', '.jpg', '.png', '.gif', '.pdf']:
-            continue
-
-        try:
-            text = path.read_text(encoding="utf-8", errors="ignore")
-        except Exception as e:
-            print(f"  Error reading {path.name}: {e}")
-            continue
-
-        if not text.strip():
-            continue
-
-        file_count += 1
-        if file_count % 10 == 0:
-            print(f"Processed {file_count} files...")
-
-        # Initialize row
-        row = {"filename": str(path.relative_to(ROOT.parent))}
+# this route will accept every txt/log file that is not csv
+@app.post("/api/files")
+async def postFile(file: UploadFile):
+    if file.filename.endswith(".csv"):
+        return {"error": "Cannot accept csv files"}
+
+    try:
+        contents = await file.read()
+        text = contents.decode("utf-8", errors="ignore")
+    except Exception as e:
+        return {"error": f"Could not read file: {e}"}
+
+    # Skip empty uploads
+    if not text.strip():
+        return {"error": "File is empty or unreadable"}
+
+    with tempfile.NamedTemporaryFile(delete=False, suffix=".csv", mode="w", newline="", encoding="utf-8") as csvfile:
+        print("Processing files...")
+        temp_path = Path(csvfile.name)
+        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
+        writer.writeheader()
+
+        row = {"filename": file.filename}
 
         # Extract metadata
-        metadata = extract_metadata(text, path.name)
+        metadata = extract_metadata(text, row["filename"])
         row.update(metadata)
 
         # Detect content type
         row["content_type"] = detect_content_type(text)
         row["text_preview"] = text[:500].replace('\n', ' ').replace('\r', ' ')
 
-        # Extract entities with regex
-        entities = find_entities(text)
-        for key, values in entities.items():
-            row[key] = "; ".join(values) if values else ""
-
         # Fill in missing pattern fields
         for pattern_key in ["EMAIL", "PHONE", "UUID", "IBAN", "DATE", "URL", "SSN"]:
             if pattern_key not in row:
                 row[pattern_key] = ""
 
-        # Extract named entities with spaCy
-        if USE_SPACY:
-            persons, orgs, locs = extract_with_spacy(text)
-            row["persons"] = "; ".join([f"{k}({v})" for k, v in persons.items()])
-            row["organizations"] = "; ".join([f"{k}({v})" for k, v in orgs.items()])
-            row["locations"] = "; ".join([f"{k}({v})" for k, v in locs.items()])
-        else:
-            row["persons"] = ""
-            row["organizations"] = ""
-            row["locations"] = ""
+        persons, orgs, locs = extract_with_spacy(text)
+        regex_entities = find_regex_entities(text)
+        row.update(regex_entities)
+        row["persons"] = "; ".join([f"{k} ({v['count']})" for k, v in persons.items()])
+        row["organizations"] = "; ".join([f"{k} ({v['count']})" for k, v in orgs.items()])
+        row["locations"] = "; ".join([f"{k} ({v['count']})" for k, v in locs.items()])
 
         writer.writerow(row)
+        pprint.pprint(row)
 
-print(f"\nComplete! Processed {file_count} files.")
-print(f"Output: discovery_dataset.csv")
-
-# Print summary statistics
-if file_count > 0:
-    print("\nTo install spaCy for better entity extraction:")
-    print("    pip install spacy")
-    print("    python -m spacy download en_core_web_sm")
+    return FileResponse(
+        temp_path, media_type="text/csv", filename="dataset.csv"
+    )
+
+if __name__ == '__main__':
+    uvicorn.run(app)

View File

@@ -1,7 +1,6 @@
"use client"; "use client";
import { TryTab } from "./Sidebar"; import { TryTab } from "./Sidebar";
import { useState, useRef, useCallback, useEffect } from "react"; import { useState, useRef, useCallback, useEffect } from "react";
import { saveLatestUpload, getLatestUpload, deleteLatestUpload } from "../../lib/idb";
interface CenterPanelProps { interface CenterPanelProps {
tab: TryTab; tab: TryTab;

View File

@@ -1001,6 +1001,7 @@
       "integrity": "sha512-NZyJarBfL7nWwIq+FDL6Zp/yHEhePMNnnJ0y3qfieCrmNvYct8uvtiV41UvlSe6apAfk0fY1FbWx+NwfmpvtTg==",
       "dev": true,
       "license": "MIT",
+      "peer": true,
       "bin": {
         "acorn": "bin/acorn"
       },
@@ -1441,6 +1442,7 @@
         }
       ],
       "license": "MIT",
+      "peer": true,
       "dependencies": {
         "baseline-browser-mapping": "^2.8.19",
         "caniuse-lite": "^1.0.30001751",
@@ -2103,6 +2105,7 @@
       "deprecated": "This version is no longer supported. Please see https://eslint.org/version-support for other options.",
       "dev": true,
       "license": "MIT",
+      "peer": true,
       "dependencies": {
         "@eslint-community/eslint-utils": "^4.2.0",
         "@eslint-community/regexpp": "^4.6.1",
@@ -2271,6 +2274,7 @@
       "integrity": "sha512-whOE1HFo/qJDyX4SnXzP4N6zOWn79WhnCUY/iDR0mPfQZO8wcYE4JClzI2oZrhBnnMUCBCHZhO6VQyoBU95mZA==",
       "dev": true,
       "license": "MIT",
+      "peer": true,
       "dependencies": {
         "@rtsao/scc": "^1.1.0",
         "array-includes": "^3.1.9",
@@ -4423,6 +4427,7 @@
         }
       ],
       "license": "MIT",
+      "peer": true,
       "dependencies": {
         "nanoid": "^3.3.7",
         "picocolors": "^1.0.0",
@@ -4581,6 +4586,7 @@
       "resolved": "https://registry.npmjs.org/react/-/react-18.2.0.tgz",
       "integrity": "sha512-/3IjMdb2L9QbBdWiW5e3P2/npwMBaU9mHCSCUzNln0ZCYbcfTsGbTJrU/kGemdH2IWmB2ioZ+zkxtmq6g09fGQ==",
       "license": "MIT",
+      "peer": true,
       "dependencies": {
         "loose-envify": "^1.1.0"
       },
@@ -4593,6 +4599,7 @@
       "resolved": "https://registry.npmjs.org/react-dom/-/react-dom-18.2.0.tgz",
       "integrity": "sha512-6IMTriUmvsjHUjNtEDudZfuDQUoWXVxKHhlEGSk81n4YFS+r/Kl99wXiwlVXtPBtJenozv2P+hxDsw9eA7Xo6g==",
       "license": "MIT",
+      "peer": true,
       "dependencies": {
         "loose-envify": "^1.1.0",
         "scheduler": "^0.23.0"
@@ -5539,6 +5546,7 @@
      "integrity": "sha512-5gTmgEY/sqK6gFXLIsQNH19lWb4ebPDLA4SdLP7dsWkIXHWlG66oPuVvXSGFPppYZz8ZDZq0dYYrbHfBCVUb1Q==",
       "dev": true,
       "license": "MIT",
+      "peer": true,
       "engines": {
         "node": ">=12"
       },
@@ -5708,6 +5716,7 @@
       "integrity": "sha512-vcI4UpRgg81oIRUFwR0WSIHKt11nJ7SAVlYNIu+QpqeyXP+gpQJy/Z4+F0aGxSE4MqwjyXvW/TzgkLAx2AGHwQ==",
       "dev": true,
       "license": "Apache-2.0",
+      "peer": true,
       "bin": {
         "tsc": "bin/tsc",
         "tsserver": "bin/tsserver"