Mirror of https://github.com/PlatypusPus/MushroomEmpire.git, synced 2026-02-07 22:18:59 +00:00
fix: better text extraction
@@ -1,64 +1,145 @@
-import csv
+import csv, tempfile
 import re
 from pathlib import Path
 from collections import Counter
 from datetime import datetime
+from fastapi import FastAPI, File, UploadFile
+from fastapi.responses import FileResponse
+import uvicorn
+import pprint

-ROOT = Path("../Data/Politics")
+app = FastAPI()

-# Try to import spaCy, fall back to basic extraction if not available
-try:
 import spacy
+from spacy.matcher import PhraseMatcher, Matcher
+from collections import defaultdict
+import re

+try:
+    nlp = spacy.load("en_core_web_trf")
+    USE_TRF = True
+except Exception:
     nlp = spacy.load("en_core_web_sm")
-    USE_SPACY = True
-except:
-    USE_SPACY = False
+    USE_TRF = False

-# Regex patterns for deterministic detection
-patterns = {
-    "EMAIL": re.compile(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+"),
-    "PHONE": re.compile(r"(?:\+?\d{1,3}[-.\s]?)?(?:\(?\d{2,4}\)?[-.\s]?)?\d{3,4}[-.\s]?\d{3,4}"),
-    "UUID": re.compile(r"\b[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}\b"),
-    "IBAN": re.compile(r"\b[A-Z]{2}\d{2}[A-Z0-9]{1,30}\b"),
-    "DATE": re.compile(r"\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b|\b\d{4}[/-]\d{1,2}[/-]\d{1,2}\b|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* \d{1,2},? \d{4}"),
-    "URL": re.compile(r"https?://[^\s]+"),
-    "SSN": re.compile(r"\b\d{3}-\d{2}-\d{4}\b"),
-}
+# small helper regexes for quick validation
+email_re = re.compile(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+")
+phone_re = re.compile(r"(?:\+?\d{1,3}[-.\s]?)?(?:\(?\d{2,4}\)?[-.\s]?)?\d{3,4}[-.\s]?\d{3,4}")
+short_token_re = re.compile(r"^[A-Za-z]{1,2}$")

-def find_entities(text):
-    """Extract entities using regex patterns."""
-    found = {}
-    for label, pattern in patterns.items():
-        matches = pattern.findall(text)
-        if matches:
-            found[label] = list(set(matches))[:5]  # Limit to 5 per type
-    return found
+# blacklist/whitelist samples (extend for your domain)
+BLACKLIST = set(["The", "In", "On", "And", "If", "But"])
+WHITELIST_TITLES = set(["Dr.", "Mr.", "Mrs.", "Ms.", "Prof."])
+# optional high-precision phrase matcher for domain terms (invoices etc.)
+phrase_matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
+for phrase in ["invoice number", "due date", "invoice", "total amount", "amount due", "customer name"]:
+    phrase_matcher.add("INVOICE_FIELD", [nlp.make_doc(phrase)])

+# generic matcher to capture patterns like "Name: John Doe"
+matcher = Matcher(nlp.vocab)
+matcher.add("KV_PATTERN", [
+    [{"IS_ALPHA": True, "OP": "+"}, {"IS_PUNCT": True, "OP": "?"}, {"IS_TITLE": True, "OP": "+"}],
+])

+def find_regex_entities(text):
+    emails = "; ".join(email_re.findall(text))
+    phones = "; ".join(phone_re.findall(text))
+    return {"EMAIL": emails, "PHONE": phones}


+# chunking to process long texts without losing context
+def chunk_text(text, max_chars=3000, overlap=200):
+    start = 0
+    L = len(text)
+    while start < L:
+        end = start + max_chars
+        if end >= L:
+            yield text[start:L]
+            break
+        # try to end at newline or space within a small window
+        cut = text.rfind("\n", start, end)
+        if cut <= start:
+            cut = text.rfind(" ", start, end)
+        if cut <= start:
+            cut = end
+        yield text[start:cut]
+        start = max(cut - overlap, start + 1)  # step back so consecutive chunks overlap for context

+def is_likely_name(ent_text):
+    # filter out very short tokens and blacklisted tokens
+    if short_token_re.match(ent_text):
+        return False
+    if any(tok.lower() in ("invoice", "total", "amount", "date", "http", "www") for tok in ent_text.split()):
+        return False
+    if ent_text.split()[0] in BLACKLIST:
+        return False
+    return True

+def add_entity(agg, ent_text, ctx):
+    rec = agg[ent_text]
+    rec["count"] += 1
+    if len(rec["examples"]) < 3:
+        rec["examples"].append(ctx)

 def extract_with_spacy(text):
-    """Extract named entities using spaCy."""
-    if not USE_SPACY:
-        return {}, {}, {}
+    persons = defaultdict(lambda: {"count": 0, "examples": []})
+    orgs = defaultdict(lambda: {"count": 0, "examples": []})
+    locs = defaultdict(lambda: {"count": 0, "examples": []})

-    doc = nlp(text[:10000])  # Limit text length for performance
-    persons = []
-    orgs = []
-    locations = []
+    for chunk in chunk_text(text):
+        doc = nlp(chunk)
+        for _, start, end in phrase_matcher(doc):
+            span = doc[start:end]
+        for match_id, start, end in matcher(doc):
+            span = doc[start:end]
+            # not necessarily an entity, but may give context

         for ent in doc.ents:
-        if ent.label_ == "PERSON":
-            persons.append(ent.text)
-        elif ent.label_ == "ORG":
-            orgs.append(ent.text)
-        elif ent.label_ in ["GPE", "LOC"]:
-            locations.append(ent.text)
+            text_ent = ent.text.strip()
+            label = ent.label_

-    # Return most common entities
-    return (
-        dict(Counter(persons).most_common(5)),
-        dict(Counter(orgs).most_common(5)),
-        dict(Counter(locations).most_common(5))
-    )
+            # basic filtering rules
+            if len(text_ent) < 2:
+                continue
+            if text_ent in BLACKLIST:
+                continue
+            # context snippet for examples (trim)
+            sent_ctx = ent.sent.text.strip()
+            if len(sent_ctx) > 200:
+                sent_ctx = sent_ctx[:200] + "..."

+            # label mapping - adapt to what spaCy model returns
+            if label in ("PERSON", "PER", "PERSONS"):
+                if is_likely_name(text_ent) or any(t in WHITELIST_TITLES for t in text_ent.split()):
+                    add_entity(persons, text_ent, sent_ctx)
+            elif label in ("ORG", "ORGANIZATION", "COMPANY"):
+                add_entity(orgs, text_ent, sent_ctx)
+            elif label in ("GPE", "LOC", "LOCATION", "CITY", "COUNTRY"):
+                add_entity(locs, text_ent, sent_ctx)
+            else:
+                pass

+        if USE_TRF:
+            for ent in doc.ents:
+                try:
+                    vec_norms = [t.vector_norm for t in ent]
+                    avg = sum(vec_norms) / max(len(vec_norms), 1)
+                    # if avg very small it's likely low-quality
+                    if avg < 5.0:
+                        # treat low-norm ent as lower confidence, optionally skip
+                        continue
+                except Exception:
+                    pass

+    def finalize(d):
+        out = {}
+        for k, v in d.items():
+            out[k] = {"count": v["count"], "examples": v["examples"]}
+        return out

+    return finalize(persons), finalize(orgs), finalize(locs)

 def extract_metadata(text, filename):
     """Extract basic metadata from text."""
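
A quick way to sanity-check the new chunking and regex helpers above: a minimal sketch, assuming the edited module is importable as main (its real filename is not shown in this diff) and that spaCy plus one of the two models is installed so the module-level load succeeds.

    # hypothetical module name; adjust to wherever this file actually lives
    from main import chunk_text, find_regex_entities

    sample = "Contact Dr. Jane Doe at jane.doe@example.com or +46 70 123 4567.\n" * 200

    # chunks should end on newline/space boundaries and overlap slightly
    chunks = list(chunk_text(sample, max_chars=500, overlap=50))
    print(len(chunks), [len(c) for c in chunks[:3]])

    # the regex pass is independent of spaCy and returns "; "-joined strings
    print(find_regex_entities(sample[:200]))
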
@@ -96,72 +177,60 @@ fieldnames = [
     "persons", "organizations", "locations"
 ]


+# this route will accept every txt/log file that is not csv
+@app.post("/api/files")
+async def postFile(file: UploadFile):
+    if file.filename.endswith("csv"):
+        return {"error": "Cannot accept csv files"}

+    try:
+        contents = await file.read()
+        text = contents.decode("utf-8", errors="ignore")
+    except Exception as e:
+        return {"error": f"Could not read file: {e}"}

+    # Skip empty uploads
+    if not text.strip():
+        return {"error": "File is empty or unreadable"}

+    with tempfile.NamedTemporaryFile(delete=False, suffix=".csv", mode="w", newline="", encoding="utf-8") as csvfile:
         print("Processing files...")
-with open("discovery_dataset.csv", "w", newline="", encoding="utf-8") as csvfile:
+        temp_path = Path(csvfile.name)
         writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
         writer.writeheader()

-    file_count = 0
-    for path in ROOT.rglob("*"):
-        if not path.is_file():
-            continue
+        row = {"filename": file.filename}

-        # Skip binary files
-        if path.suffix.lower() in ['.exe', '.dll', '.so', '.dylib', '.bin', '.jpg', '.png', '.gif', '.pdf']:
-            continue

-        try:
-            text = path.read_text(encoding="utf-8", errors="ignore")
-        except Exception as e:
-            print(f" Error reading {path.name}: {e}")
-            continue

-        if not text.strip():
-            continue

-        file_count += 1
-        if file_count % 10 == 0:
-            print(f"Processed {file_count} files...")

-        # Initialize row
-        row = {"filename": str(path.relative_to(ROOT.parent))}

         # Extract metadata
-        metadata = extract_metadata(text, path.name)
+        metadata = extract_metadata(text, row["filename"])
         row.update(metadata)

         # Detect content type
         row["content_type"] = detect_content_type(text)
         row["text_preview"] = text[:500].replace('\n', ' ').replace('\r', ' ')

-        # Extract entities with regex
-        entities = find_entities(text)
-        for key, values in entities.items():
-            row[key] = "; ".join(values) if values else ""

         # Fill in missing pattern fields
         for pattern_key in ["EMAIL", "PHONE", "UUID", "IBAN", "DATE", "URL", "SSN"]:
             if pattern_key not in row:
                 row[pattern_key] = ""

-        # Extract named entities with spaCy
-        if USE_SPACY:
         persons, orgs, locs = extract_with_spacy(text)
-            row["persons"] = "; ".join([f"{k}({v})" for k, v in persons.items()])
-            row["organizations"] = "; ".join([f"{k}({v})" for k, v in orgs.items()])
-            row["locations"] = "; ".join([f"{k}({v})" for k, v in locs.items()])
-        else:
-            row["persons"] = ""
-            row["organizations"] = ""
-            row["locations"] = ""
+        regex_entities = find_regex_entities(text)
+        row.update(regex_entities)
+        row["persons"] = "; ".join([f"{k} ({v['count']})" for k, v in persons.items()])
+        row["organizations"] = "; ".join([f"{k} ({v['count']})" for k, v in orgs.items()])
+        row["locations"] = "; ".join([f"{k} ({v['count']})" for k, v in locs.items()])

         writer.writerow(row)
+        pprint.pprint(row)

-print(f"\nComplete! Processed {file_count} files.")
-print(f"Output: discovery_dataset.csv")
+    return FileResponse(
+        temp_path, media_type="text/csv", filename="dataset.csv"
+    )

-# Print summary statistics
-if file_count > 0:
-    print("\nTo install spaCy for better entity extraction:")
-    print(" pip install spacy")
-    print(" python -m spacy download en_core_web_sm")
+if __name__ == '__main__':
+    uvicorn.run(app)
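
A minimal sketch of calling the new /api/files route, assuming the app is served locally via uvicorn on the default port 8000 and that the requests package is installed; notes.txt stands in for any plain-text upload.

    import requests

    # POST a text/log file; the endpoint answers with a CSV attachment on success
    with open("notes.txt", "rb") as fh:
        resp = requests.post(
            "http://127.0.0.1:8000/api/files",
            files={"file": ("notes.txt", fh, "text/plain")},
        )

    # error cases (csv upload, empty or unreadable file) come back as JSON instead
    if resp.headers.get("content-type", "").startswith("text/csv"):
        with open("dataset.csv", "wb") as out:
            out.write(resp.content)
    else:
        print(resp.json())
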
@@ -1,7 +1,6 @@
 "use client";
 import { TryTab } from "./Sidebar";
 import { useState, useRef, useCallback, useEffect } from "react";
-import { saveLatestUpload, getLatestUpload, deleteLatestUpload } from "../../lib/idb";

 interface CenterPanelProps {
   tab: TryTab;
frontend/nordic-privacy-ai/package-lock.json (generated, 9 changed lines)
@@ -1001,6 +1001,7 @@
       "integrity": "sha512-NZyJarBfL7nWwIq+FDL6Zp/yHEhePMNnnJ0y3qfieCrmNvYct8uvtiV41UvlSe6apAfk0fY1FbWx+NwfmpvtTg==",
       "dev": true,
       "license": "MIT",
+      "peer": true,
       "bin": {
         "acorn": "bin/acorn"
       },
@@ -1441,6 +1442,7 @@
         }
       ],
       "license": "MIT",
+      "peer": true,
       "dependencies": {
         "baseline-browser-mapping": "^2.8.19",
         "caniuse-lite": "^1.0.30001751",
@@ -2103,6 +2105,7 @@
       "deprecated": "This version is no longer supported. Please see https://eslint.org/version-support for other options.",
       "dev": true,
       "license": "MIT",
+      "peer": true,
       "dependencies": {
         "@eslint-community/eslint-utils": "^4.2.0",
         "@eslint-community/regexpp": "^4.6.1",
@@ -2271,6 +2274,7 @@
       "integrity": "sha512-whOE1HFo/qJDyX4SnXzP4N6zOWn79WhnCUY/iDR0mPfQZO8wcYE4JClzI2oZrhBnnMUCBCHZhO6VQyoBU95mZA==",
       "dev": true,
       "license": "MIT",
+      "peer": true,
       "dependencies": {
         "@rtsao/scc": "^1.1.0",
         "array-includes": "^3.1.9",
@@ -4423,6 +4427,7 @@
         }
       ],
       "license": "MIT",
+      "peer": true,
       "dependencies": {
         "nanoid": "^3.3.7",
         "picocolors": "^1.0.0",
@@ -4581,6 +4586,7 @@
       "resolved": "https://registry.npmjs.org/react/-/react-18.2.0.tgz",
       "integrity": "sha512-/3IjMdb2L9QbBdWiW5e3P2/npwMBaU9mHCSCUzNln0ZCYbcfTsGbTJrU/kGemdH2IWmB2ioZ+zkxtmq6g09fGQ==",
       "license": "MIT",
+      "peer": true,
       "dependencies": {
         "loose-envify": "^1.1.0"
       },
@@ -4593,6 +4599,7 @@
       "resolved": "https://registry.npmjs.org/react-dom/-/react-dom-18.2.0.tgz",
       "integrity": "sha512-6IMTriUmvsjHUjNtEDudZfuDQUoWXVxKHhlEGSk81n4YFS+r/Kl99wXiwlVXtPBtJenozv2P+hxDsw9eA7Xo6g==",
       "license": "MIT",
+      "peer": true,
       "dependencies": {
         "loose-envify": "^1.1.0",
         "scheduler": "^0.23.0"
@@ -5539,6 +5546,7 @@
       "integrity": "sha512-5gTmgEY/sqK6gFXLIsQNH19lWb4ebPDLA4SdLP7dsWkIXHWlG66oPuVvXSGFPppYZz8ZDZq0dYYrbHfBCVUb1Q==",
       "dev": true,
       "license": "MIT",
+      "peer": true,
       "engines": {
         "node": ">=12"
       },
@@ -5708,6 +5716,7 @@
       "integrity": "sha512-vcI4UpRgg81oIRUFwR0WSIHKt11nJ7SAVlYNIu+QpqeyXP+gpQJy/Z4+F0aGxSE4MqwjyXvW/TzgkLAx2AGHwQ==",
       "dev": true,
       "license": "Apache-2.0",
+      "peer": true,
       "bin": {
         "tsc": "bin/tsc",
         "tsserver": "bin/tsserver"