| |
| |
| |
| |
| |
|
|
| import re, os, hashlib |
| from datetime import datetime |
| from concurrent.futures import ThreadPoolExecutor, as_completed |
|
|
| import gradio as gr |
|
|
| try: |
| import httpx |
| HAS_HTTPX = True |
| except ImportError: |
| HAS_HTTPX = False |
|
|
| try: |
| import chromadb |
| HAS_CHROMADB = True |
| except ImportError: |
| HAS_CHROMADB = False |
|
|
| try: |
| import numpy as np |
| HAS_NUMPY = True |
| except ImportError: |
| HAS_NUMPY = False |
|
|
| try: |
| from google import genai |
| from google.genai import types as gtypes |
| HAS_GENAI = True |
| except ImportError: |
| HAS_GENAI = False |
|
|
# API credentials, read once at import time (empty string when unset).
GEMINI_KEY = os.environ.get("GEMINI_API_KEY", "")
BRAVE_KEY = os.environ.get("BRAVE_API_KEY", "")
|
|
|
|
| |
| |
| |
|
|
| def _split_sentences(text): |
| """문장 분리 (경량 regex 버전)""" |
| text = re.sub(r'\s+', ' ', text).strip() |
| sents = re.split(r'(?<=[.!?。])\s+', text) |
| return [s.strip() for s in sents if s.strip()] |
|
|
|
|
| def _split_words(text): |
| """단어 분리""" |
| return [w for w in re.findall(r'[가-힣a-zA-Z0-9]+', text) if w] |
|
|
|
|
| def _http_get(url, headers=None, timeout=15): |
| """HTTP GET""" |
| try: |
| if HAS_HTTPX: |
| r = httpx.get(url, headers=headers or {}, timeout=timeout, follow_redirects=True) |
| return r.text if r.status_code == 200 else None |
| else: |
| import urllib.request |
| req = urllib.request.Request(url, headers=headers or {}) |
| with urllib.request.urlopen(req, timeout=timeout) as resp: |
| return resp.read().decode('utf-8', errors='replace') |
| except: |
| return None |
|
|
|
|
| |
| |
| |
|
|
def brave_search(query, count=5):
    """Query the Brave Search web API.

    Returns a list of {title, url, snippet, source} dicts, or [] when the
    API key is missing, the request fails, or httpx is unavailable.
    """
    if not BRAVE_KEY:
        return []
    import urllib.parse
    # FIX: percent-encode the query — raw f-string interpolation produced an
    # invalid URL for queries containing spaces, Hangul, '&', etc.
    url = f"https://api.search.brave.com/res/v1/web/search?q={urllib.parse.quote(str(query))}&count={count}"
    try:
        if HAS_HTTPX:
            r = httpx.get(url, headers={"X-Subscription-Token": BRAVE_KEY, "Accept": "application/json"}, timeout=10)
            if r.status_code == 200:
                data = r.json()
                results = []
                for item in data.get("web", {}).get("results", []):
                    results.append({
                        "title": item.get("title", ""),
                        "url": item.get("url", ""),
                        "snippet": item.get("description", ""),
                        "source": "Brave",
                    })
                return results
    except Exception:
        # FIX: bare except narrowed; search failures degrade to [].
        pass
    return []
|
|
|
|
def search_kci(query):
    """Search the KCI (Korean Citation Index) open API for articles.

    Returns up to 3 {title, url, snippet, source} dicts; [] on any failure.
    """
    try:
        import urllib.parse
        # FIX: percent-encode the query — Hangul/spaces made the URL invalid.
        q = urllib.parse.quote(str(query))
        url = f"https://open.kci.go.kr/po/openapi/openApiSearch.kci?apiCode=articleSearch&title={q}&displayCount=3"
        resp = _http_get(url, timeout=8)
        if resp:
            results = []
            # Pull title/url pairs out of the XML response's CDATA sections.
            for m in re.finditer(
                r'<article-title><!\[CDATA\[(.+?)\]\]></article-title>.*?<url><!\[CDATA\[(.+?)\]\]></url>',
                resp, re.S,
            ):
                results.append({"title": m.group(1), "url": m.group(2), "snippet": "", "source": "KCI"})
            return results[:3]
    except Exception:
        # FIX: bare except narrowed; failures degrade to [].
        pass
    return []
|
|
|
|
def search_riss(query):
    """Search RISS by scraping its HTML search page.

    Returns up to 3 {title, url, snippet, source} dicts; [] on any failure.
    """
    results = []
    try:
        import urllib.parse
        # FIX: percent-encode the query — Hangul/spaces made the URL invalid.
        q = urllib.parse.quote(str(query))
        url = (
            f"http://www.riss.kr/search/Search.do?isDetailSearch=N&searchGubun=true"
            f"&viewYn=OP&queryText=&strQuery={q}&iStartCount=0&iGroupView=5&icate=all"
        )
        resp = _http_get(url, timeout=8)
        if resp:
            for m in re.finditer(
                r'class="title"[^>]*>.*?<a[^>]*href="([^"]+)"[^>]*>(.*?)</a>', resp, re.S
            ):
                # Strip any markup nested inside the anchor text.
                title = re.sub(r'<[^>]+>', '', m.group(2)).strip()
                if title:
                    results.append({
                        "title": title,
                        "url": "https://www.riss.kr" + m.group(1),
                        "snippet": "",
                        "source": "RISS",
                    })
    except Exception:
        # FIX: bare except narrowed; failures degrade to [].
        pass
    return results[:3]
|
|
|
|
def search_arxiv(query):
    """Search the arXiv Atom API for papers.

    Returns up to 3 {title, url, snippet, source} dicts; [] on any failure.
    """
    results = []
    try:
        import urllib.parse
        q = urllib.parse.quote(query)
        url = f"https://export.arxiv.org/api/query?search_query=all:{q}&start=0&max_results=3&sortBy=relevance"
        resp = _http_get(url, timeout=12)
        if resp:
            # Parse <entry> blocks from the Atom feed with a lightweight regex.
            for m in re.finditer(
                r'<entry>.*?<title>(.*?)</title>.*?<id>(.*?)</id>.*?<summary>(.*?)</summary>',
                resp, re.S,
            ):
                title = re.sub(r'\s+', ' ', m.group(1)).strip()
                results.append({
                    "title": title,
                    "url": m.group(2).strip(),
                    "snippet": re.sub(r'\s+', ' ', m.group(3)).strip()[:150],
                    "source": "arXiv",
                })
    except Exception:
        # FIX: bare except narrowed; failures degrade to [].
        pass
    return results[:3]
|
|
|
|
def parallel_brave_search(queries, max_workers=10):
    """Run brave_search over all queries concurrently (3 results each).

    Returns {query: results}; a failed query maps to [] so one bad query
    cannot sink the whole batch.
    """
    all_results = {}
    with ThreadPoolExecutor(max_workers=min(max_workers, 20)) as executor:
        futures = {executor.submit(brave_search, q, 3): q for q in queries}
        for future in as_completed(futures):
            q = futures[future]
            try:
                all_results[q] = future.result()
            except Exception:
                # FIX: bare except narrowed; brave_search already swallows HTTP
                # errors, this guards against unexpected worker failures.
                all_results[q] = []
    return all_results
|
|
|
|
def duckduckgo_search(query, max_results=5):
    """Scrape DuckDuckGo's HTML endpoint — key-free fallback web search.

    Returns up to max_results {title, url, snippet, source} dicts; [] on
    any failure.
    """
    results = []
    try:
        import urllib.parse
        q = urllib.parse.quote(query)
        url = f"https://html.duckduckgo.com/html/?q={q}"
        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}
        resp = _http_get(url, headers=headers, timeout=10)
        if resp:
            for m in re.finditer(
                r'<a[^>]+class="result__a"[^>]+href="([^"]+)"[^>]*>(.*?)</a>'
                r'.*?<a[^>]+class="result__snippet"[^>]*>(.*?)</a>',
                resp, re.S,
            ):
                href = m.group(1)
                title = re.sub(r'<[^>]+>', '', m.group(2)).strip()
                snippet = re.sub(r'<[^>]+>', '', m.group(3)).strip()
                real_url = href
                # DDG wraps targets in a redirect: /l/?uddg=<encoded real URL>;
                # unwrap it to report the true destination.
                if 'uddg=' in href:
                    um = re.search(r'uddg=([^&]+)', href)
                    if um:
                        real_url = urllib.parse.unquote(um.group(1))
                if title:
                    results.append({"title": title, "url": real_url, "snippet": snippet, "source": "Web"})
                if len(results) >= max_results:
                    break
    except Exception:
        # FIX: bare except narrowed; failures degrade to [].
        pass
    return results
|
|
|
|
def self_crawl_search(query, max_results=3):
    """DuckDuckGo crawl, topped up with an academic-flavored follow-up query
    when the input does not already ask for papers."""
    combined = list(duckduckgo_search(query, max_results))
    asks_for_papers = '논문' in query or 'paper' in query.lower()
    if not asks_for_papers:
        combined += duckduckgo_search(f"{query} 논문 학술", 2)
    return combined
|
|
|
|
| |
| |
| |
|
|
def gemini_plagiarism_check(text_chunk):
    """Check a text chunk for plagiarism via Gemini with Google Search grounding.

    Returns None when the SDK or API key is unavailable; otherwise a dict with
    the estimated similarity percent, the raw model response, and any grounded
    web sources. API errors yield {"pct": 0, ...} with the error text.
    """
    if not HAS_GENAI or not GEMINI_KEY:
        return None
    try:
        api_client = genai.Client(api_key=GEMINI_KEY)
        search_tool = gtypes.Tool(googleSearch=gtypes.GoogleSearch())
        prompt = f"""다음 텍스트가 인터넷에 존재하는지 Google Search로 확인하세요.
유사한 문장이 발견되면 출처 URL과 유사도(%)를 보고하세요.
마지막 줄에 "유사도: XX%" 형식으로 작성.
[텍스트]
{text_chunk[:1000]}"""
        reply = api_client.models.generate_content(
            model="gemini-flash-lite-latest",
            contents=prompt,
            config=gtypes.GenerateContentConfig(tools=[search_tool], temperature=0.1, max_output_tokens=600),
        )
        reply_text = reply.text if reply.text else ""

        # Collect web sources from grounding metadata, when present.
        sources = []
        if hasattr(reply, 'candidates') and reply.candidates:
            meta = reply.candidates[0].grounding_metadata
            if meta and hasattr(meta, 'grounding_chunks'):
                for grounded in meta.grounding_chunks:
                    if hasattr(grounded, 'web') and grounded.web:
                        sources.append({
                            "title": grounded.web.title or "",
                            "url": grounded.web.uri or "",
                            "source": "Google",
                        })

        # The prompt asks the model to end with "유사도: XX%"; parse that.
        pct_match = re.search(r'유사도[:\s]*(\d+)', reply_text)
        estimated_pct = int(pct_match.group(1)) if pct_match else 0
        return {"pct": estimated_pct, "response": reply_text, "sources": sources}
    except Exception as e:
        return {"pct": 0, "response": str(e)[:100], "sources": []}
|
|
|
|
| |
| |
| |
|
|
| def _ngram_set(text, n=3): |
| """텍스트에서 n-gram 집합 생성""" |
| text = re.sub(r'\s+', ' ', text.lower().strip()) |
| words = re.findall(r'[가-힣a-zA-Z0-9]+', text) |
| if len(words) < n: |
| return set(words) |
| return {' '.join(words[i:i + n]) for i in range(len(words) - n + 1)} |
|
|
|
|
| def _text_similarity(text_a, text_b, n=3): |
| """두 텍스트 간 n-gram Jaccard 유사도 (0.0~1.0)""" |
| if not text_a or not text_b: |
| return 0.0 |
| set_a = _ngram_set(text_a, n) |
| set_b = _ngram_set(text_b, n) |
| if not set_a or not set_b: |
| return 0.0 |
| intersection = set_a & set_b |
| union = set_a | set_b |
| return len(intersection) / len(union) if union else 0.0 |
|
|
|
|
| def _containment_similarity(original, source_text, n=3): |
| """원문 n-gram이 출처에 얼마나 포함되는지 (Containment, 0.0~1.0)""" |
| if not original or not source_text: |
| return 0.0 |
| set_orig = _ngram_set(original, n) |
| set_src = _ngram_set(source_text, n) |
| if not set_orig: |
| return 0.0 |
| overlap = set_orig & set_src |
| return len(overlap) / len(set_orig) |
|
|
|
|
def _fetch_page_text(url, timeout=8):
    """Fetch a URL and return up to 5000 chars of its visible text.

    Lightweight crawl: strips <script>/<style> blocks, removes tags, and
    collapses whitespace. Returns "" when httpx is missing or fetch fails.
    """
    if not HAS_HTTPX:
        return ""
    try:
        ua = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}
        response = httpx.get(url, headers=ua, timeout=timeout, follow_redirects=True)
        if response.status_code != 200:
            return ""
        body = response.text
        # Drop script/style content first so it doesn't leak into the text.
        body = re.sub(r'<script[^>]*>.*?</script>', '', body, flags=re.S | re.I)
        body = re.sub(r'<style[^>]*>.*?</style>', '', body, flags=re.S | re.I)
        stripped = re.sub(r'<[^>]+>', ' ', body)
        return re.sub(r'\s+', ' ', stripped).strip()[:5000]
    except Exception:
        return ""
|
|
|
|
| def _verify_sources_parallel(original_text, sources, min_similarity=0.02, max_workers=8): |
| """출처 URL을 크롤링하여 원문과 대조 검증 — 유사도 미달 출처 제거, 크롤링 본문도 반환""" |
| if not sources: |
| return [], 0.0, {} |
|
|
| verified = [] |
| total_sim = 0.0 |
| crawled_texts = {} |
|
|
| def _check_one(src): |
| url = src.get("url", "") |
| snippet = src.get("snippet", "") |
| page_text = "" |
|
|
| |
| page_text = _fetch_page_text(url, timeout=6) |
|
|
| if snippet and len(snippet) > 20: |
| sim = _text_similarity(original_text[:500], snippet, n=2) |
| if sim >= min_similarity: |
| return {**src, "similarity": round(sim * 100, 1), "method": "snippet"}, sim, page_text |
|
|
| if page_text and len(page_text) > 50: |
| jaccard = _text_similarity(original_text, page_text, n=3) |
| containment = _containment_similarity(original_text, page_text, n=3) |
| sim = max(jaccard, containment) |
| if sim >= min_similarity: |
| matched = _find_matching_sentences(original_text, page_text) |
| return {**src, "similarity": round(sim * 100, 1), "method": "crawl", "matched": matched}, sim, page_text |
|
|
| return None, 0.0, page_text |
|
|
| with ThreadPoolExecutor(max_workers=min(max_workers, len(sources))) as executor: |
| futures = {executor.submit(_check_one, src): src for src in sources[:20]} |
| for future in as_completed(futures, timeout=30): |
| try: |
| result, sim, page_text = future.result() |
| src_info = futures[future] |
| url = src_info.get("url", "") |
| |
| if page_text and len(page_text) > 50 and url: |
| crawled_texts[url] = { |
| "text": page_text, |
| "title": src_info.get("title", ""), |
| "source": src_info.get("source", ""), |
| } |
| if result is not None: |
| verified.append(result) |
| total_sim += sim |
| except Exception: |
| pass |
|
|
| verified.sort(key=lambda x: x.get("similarity", 0), reverse=True) |
| avg_sim = total_sim / len(verified) if verified else 0.0 |
| return verified, avg_sim, crawled_texts |
|
|
|
|
def _find_matching_sentences(original, source_text, threshold=0.3):
    """Return up to three 60-char excerpts of *original* sentences whose
    2-gram similarity to *source_text* meets the threshold.

    Only the first 20 sentences are examined; fragments under 10 chars are
    skipped.
    """
    matched = []
    for candidate in _split_sentences(original)[:20]:
        if len(candidate) < 10:
            continue
        if _text_similarity(candidate, source_text, n=2) >= threshold:
            matched.append(candidate[:60])
            if len(matched) >= 3:
                break
    return matched
|
|
|
|
| |
| |
| |
|
|
def _chunk_text(text, chunk_size=200, overlap=50):
    """Group sentences into chunks of roughly chunk_size characters.

    Sentence boundaries are preserved; when overlap > 0 the last sentence of
    each flushed chunk is carried into the next one for context continuity.
    """
    pieces = []
    buffer = []
    buffered_chars = 0
    for sentence in _split_sentences(text):
        buffer.append(sentence)
        buffered_chars += len(sentence)
        if buffered_chars < chunk_size:
            continue
        pieces.append(' '.join(buffer))
        if overlap > 0 and len(buffer) > 1:
            # Carry the last sentence over as the overlap seed.
            buffer = buffer[-1:]
            buffered_chars = len(buffer[0])
        else:
            buffer = []
            buffered_chars = 0
    if buffer:
        pieces.append(' '.join(buffer))
    return pieces
|
|
|
|
class VectorPlagiarismDB:
    """Store crawled source pages in an in-memory ChromaDB collection and
    search them sentence-by-sentence for plagiarism matches."""

    def __init__(self):
        # Degrade gracefully: mark the DB unusable instead of raising when
        # chromadb is missing or fails to initialize.
        if not HAS_CHROMADB:
            self.available = False
            return
        try:
            # In-memory client; indexed data lives only for this process.
            self.client = chromadb.Client()
            self.collection = self.client.get_or_create_collection(
                name="plag_sources",
                metadata={"hnsw:space": "cosine"},  # cosine-distance index
            )
            self.available = True
            self._doc_count = 0  # number of chunks indexed so far
        except Exception as e:
            print(f"ChromaDB 초기화 실패: {e}")
            self.available = False

    def index_source(self, url, title, text, source_type="Web"):
        """Split a crawled source document into chunks and add them to the
        collection.

        Returns the number of chunks indexed (0 when the DB is unavailable,
        the text is too short, or indexing fails).
        """
        if not self.available or not text or len(text.strip()) < 30:
            return 0
        try:
            chunks = _chunk_text(text, chunk_size=200, overlap=50)
            if not chunks:
                return 0

            # Chunk ids are "<md5(url) prefix>_<index>", so re-indexing the
            # same URL produces the same ids.
            doc_hash = hashlib.md5(url.encode()).hexdigest()[:10]
            ids = []
            documents = []
            metadatas = []

            for i, chunk in enumerate(chunks[:50]):  # cap at 50 chunks/source
                chunk_id = f"{doc_hash}_{i}"
                ids.append(chunk_id)
                documents.append(chunk)
                metadatas.append({
                    "url": url[:200],
                    "title": title[:100],
                    "source": source_type,
                    "chunk_idx": i,
                })

            self.collection.add(ids=ids, documents=documents, metadatas=metadatas)
            self._doc_count += len(ids)
            return len(ids)
        except Exception as e:
            print(f"벡터DB 인덱싱 오류: {e}")
            return 0

    def query_sentence(self, sentence, n_results=3):
        """Return the indexed chunks most similar to a single sentence.

        Each match dict carries a 0-100 "similarity" score plus the chunk's
        url/title/source metadata and a 150-char excerpt. Matches below 0.3
        raw similarity are dropped. Returns [] when the DB is empty/unavailable
        or the query fails.
        """
        if not self.available or self._doc_count == 0:
            return []
        try:
            results = self.collection.query(
                query_texts=[sentence],
                n_results=min(n_results, self._doc_count),
            )
            matches = []
            if results and results['distances'] and results['distances'][0]:
                for j, dist in enumerate(results['distances'][0]):
                    # Map cosine distance (range 0..2) onto a 0..1 similarity.
                    similarity = max(0, 1 - dist / 2)
                    if similarity < 0.3:
                        continue  # too weak to report
                    meta = results['metadatas'][0][j] if results['metadatas'] else {}
                    doc_text = results['documents'][0][j] if results['documents'] else ""
                    matches.append({
                        "similarity": round(similarity * 100, 1),
                        "url": meta.get("url", ""),
                        "title": meta.get("title", ""),
                        "source": meta.get("source", ""),
                        "matched_text": doc_text[:150],
                    })
            return matches
        except Exception as e:
            print(f"벡터DB 쿼리 오류: {e}")
            return []

    def check_document(self, text, min_similarity=30):
        """Scan a whole document sentence by sentence against the index.

        Returns (flagged, plag_ratio, source_hits):
        - flagged: dicts for sentences whose best match >= min_similarity,
        - plag_ratio: flagged / checked sentences * 100,
        - source_hits: url -> count of flagged sentences matching that url.
        """
        if not self.available or self._doc_count == 0:
            return [], 0.0, {}

        sents = _split_sentences(text)
        flagged = []
        all_sims = []
        source_hits = {}

        for sent in sents:
            if len(sent) < 15:
                continue  # skip fragments too short to judge reliably
            matches = self.query_sentence(sent, n_results=3)
            if not matches:
                all_sims.append(0)
                continue

            # First match is the strongest (query results come back ranked).
            best = matches[0]
            all_sims.append(best["similarity"])

            if best["similarity"] >= min_similarity:
                flagged.append({
                    "sentence": sent[:80],
                    "similarity": best["similarity"],
                    "url": best["url"],
                    "title": best["title"],
                    "matched_text": best["matched_text"][:100],
                })
                url = best["url"]
                source_hits[url] = source_hits.get(url, 0) + 1

        # Ratio of flagged sentences among all sentences long enough to check.
        total_checked = len(all_sims)
        flagged_count = len(flagged)
        plag_ratio = (flagged_count / total_checked * 100) if total_checked > 0 else 0

        # NOTE(review): avg_sim is computed but never returned or used.
        avg_sim = sum(s["similarity"] for s in flagged) / len(flagged) if flagged else 0

        return flagged, plag_ratio, source_hits

    def get_stats(self):
        """Report DB availability and the number of indexed chunks."""
        if not self.available:
            return {"available": False, "chunks": 0}
        return {"available": True, "chunks": self._doc_count}
|
|
|
| |
| |
| |
|
|
def _extract_key_phrases(text, max_phrases=6):
    """Pick up to max_phrases medium-length sentences (longest first) to use
    as web/academic search queries.

    Sentences over 80 chars are truncated; ones under 15 chars are skipped.
    Falls back to the first sentence when fewer than two phrases qualify.
    """
    sents = _split_sentences(text)
    phrases = []
    for sent in sorted(sents, key=len, reverse=True):
        if len(sent) > 80:
            phrases.append(sent[:80])
        elif len(sent) >= 15:
            phrases.append(sent)
        if len(phrases) >= max_phrases:
            break
    if len(phrases) < 2 and sents:
        phrases.append(sents[0][:80])
    return phrases
|
|
|
|
| |
| |
| |
|
|
def run_plagiarism(text, progress=gr.Progress()):
    """Run the full plagiarism-check pipeline and render an HTML report.

    Pipeline stages:
      ① Gemini with Google Search grounding (estimate + grounded sources)
      ② Brave Search (or DuckDuckGo fallback) on extracted key phrases
      ③ Academic DBs: KCI / RISS (Korean text only) / arXiv
      ④ Crawl-and-compare verification of all collected sources
      ⑤ Per-sentence vector-DB comparison against crawled pages (chromadb)

    Returns (report_html, log_text). Texts under 50 chars short-circuit with
    a warning snippet and an empty log.
    """
    if not text or len(text.strip()) < 50:
        return "<div style='padding:20px;text-align:center;color:#888;'>⚠️ 최소 50자 이상</div>", ""

    text = text.strip()
    now = datetime.now().strftime("%Y-%m-%d %H:%M")
    # Short stable document id derived from the first 100 chars.
    doc_id = hashlib.md5(text[:100].encode()).hexdigest()[:8].upper()
    log_lines = []

    def _prog(val, msg=""):
        # Progress reporting must never break the pipeline.
        try:
            progress(val, msg)
        except Exception:  # FIX: was a bare except
            pass

    gemini_pct = 0
    gemini_sources = []
    gemini_response = ""
    brave_sources = []
    academic_sources = []

    # ── Stage ①: Gemini + Google Search grounding ─────────────────────────
    if HAS_GENAI and GEMINI_KEY:
        _prog(0.10, "① Gemini Google Search Grounding...")
        try:
            client = genai.Client(api_key=GEMINI_KEY)

            prompt = f"""당신은 표절 검사 전문가입니다. 아래 텍스트가 인터넷에 이미 존재하는 내용인지 Google Search로 철저히 검색하세요.

[검사 대상 텍스트]
{text[:3000]}

[응답 형식]
1. 발견된 유사 콘텐츠를 각각 "제목 | URL | 유사도(높음/중간/낮음)" 형식으로 나열
2. 발견 못하면 "유사 콘텐츠 없음"
3. 마지막 줄에 반드시 "표절율: XX%" 형식으로 종합 판정"""

            contents = [
                gtypes.Content(
                    role="user",
                    parts=[gtypes.Part.from_text(text=prompt)],
                )
            ]
            tools = [gtypes.Tool(googleSearch=gtypes.GoogleSearch())]
            generate_content_config = gtypes.GenerateContentConfig(
                thinking_config=gtypes.ThinkingConfig(thinking_budget=0),
                tools=tools,
                temperature=0.1,
                max_output_tokens=4000,
            )

            _prog(0.20, "① Google Search 실행 중...")

            # Stream the response so long outputs are accumulated incrementally.
            full_response = ""
            for chunk in client.models.generate_content_stream(
                model="gemini-flash-lite-latest",
                contents=contents,
                config=generate_content_config,
            ):
                if chunk.text:
                    full_response += chunk.text
            gemini_response = full_response

            # Parse the "표절율: XX%" verdict the prompt mandates.
            pm = re.search(r'표절율[:\s]*(\d+)', full_response)
            if pm:
                gemini_pct = int(pm.group(1))

            # Second, non-streaming call to obtain grounding metadata
            # (the streaming API does not expose it here).
            _prog(0.28, "① 출처 메타데이터 추출...")
            try:
                resp_full = client.models.generate_content(
                    model="gemini-flash-lite-latest",
                    contents=prompt,
                    config=gtypes.GenerateContentConfig(
                        tools=[gtypes.Tool(googleSearch=gtypes.GoogleSearch())],
                        temperature=0.1,
                        max_output_tokens=2000,
                    ),
                )
                if hasattr(resp_full, 'candidates') and resp_full.candidates:
                    cand = resp_full.candidates[0]
                    gm = getattr(cand, 'grounding_metadata', None)
                    if gm:
                        chunks = getattr(gm, 'grounding_chunks', None) or []
                        for gc in chunks:
                            web = getattr(gc, 'web', None)
                            if web:
                                title = getattr(web, 'title', '') or ''
                                uri = getattr(web, 'uri', '') or ''
                                if uri:
                                    gemini_sources.append({
                                        "title": title,
                                        "url": uri,
                                        "source": "Google",
                                        "snippet": "",
                                    })
                        # Attach grounded text segments as snippets.
                        supports = getattr(gm, 'grounding_supports', None) or []
                        for sup in supports:
                            seg = getattr(sup, 'segment', None)
                            snippet_text = getattr(seg, 'text', '') if seg else ''
                            idxs = getattr(sup, 'grounding_chunk_indices', []) or []
                            for idx in idxs:
                                if idx < len(gemini_sources) and snippet_text:
                                    gemini_sources[idx]["snippet"] = snippet_text[:120]
            except Exception as e2:
                print(f"Gemini 메타데이터 추출 오류: {e2}")

            # Also harvest any plain URLs the model wrote into its answer.
            existing_urls = {s["url"] for s in gemini_sources}
            for m in re.finditer(r'https?://[^\s\)\]\,\"\']{10,}', full_response):
                url = m.group(0).rstrip('.')
                if url not in existing_urls:
                    domain = url.split('/')[2] if len(url.split('/')) > 2 else url
                    gemini_sources.append({"title": domain, "url": url, "source": "Google", "snippet": ""})
                    existing_urls.add(url)

            log_lines.append(f"[Gemini] 표절율={gemini_pct}%, 출처={len(gemini_sources)}건")

        except Exception as e:
            log_lines.append(f"[Gemini] 오류: {str(e)[:100]}")
            print(f"Gemini 오류: {str(e)}")
    else:
        log_lines.append("[Gemini] API 키 없음 — 건너뜀")

    # ── Stage ②: Brave Search (DuckDuckGo fallback without a key) ─────────
    _prog(0.40, "② Brave Search 웹 검색...")
    key_phrases = _extract_key_phrases(text, max_phrases=6)

    if BRAVE_KEY and key_phrases:
        try:
            brave_results = parallel_brave_search(key_phrases, max_workers=10)
            seen_urls = {s["url"] for s in gemini_sources}
            for query, results in brave_results.items():
                for r in results:
                    url = r.get("url", "")
                    if url and url not in seen_urls:
                        brave_sources.append({
                            "title": r.get("title", "")[:80],
                            "url": url,
                            "source": "Brave",
                            "snippet": r.get("snippet", "")[:120],
                        })
                        seen_urls.add(url)
            log_lines.append(f"[Brave] 쿼리={len(key_phrases)}개, 출처={len(brave_sources)}건")
        except Exception as e:
            log_lines.append(f"[Brave] 오류: {str(e)[:80]}")
    elif not BRAVE_KEY:
        try:
            seen_urls = {s["url"] for s in gemini_sources}
            for phrase in key_phrases[:3]:
                for r in duckduckgo_search(phrase, max_results=3):
                    url = r.get("url", "")
                    if url and url not in seen_urls:
                        brave_sources.append({
                            "title": r.get("title", "")[:80],
                            "url": url,
                            "source": "Web",
                            "snippet": r.get("snippet", "")[:120],
                        })
                        seen_urls.add(url)
            log_lines.append(f"[DuckDuckGo] 폴백, 출처={len(brave_sources)}건")
        except Exception as e:
            log_lines.append(f"[DuckDuckGo] 오류: {str(e)[:80]}")

    # ── Stage ③: academic databases (KCI / RISS / arXiv) ──────────────────
    _prog(0.60, "③ 학술 DB 검색 (KCI·RISS·arXiv)...")

    academic_query = text[:100].replace('\n', ' ')
    has_korean = bool(re.search(r'[가-힣]', text))

    try:
        with ThreadPoolExecutor(max_workers=5) as executor:
            futures = {}
            futures[executor.submit(search_arxiv, academic_query[:60])] = "arXiv"
            if has_korean:
                # Korean-language DBs are only useful for Korean text.
                futures[executor.submit(search_kci, academic_query[:40])] = "KCI"
                futures[executor.submit(search_riss, academic_query[:40])] = "RISS"

            seen_urls = {s["url"] for s in gemini_sources + brave_sources}
            for future in as_completed(futures, timeout=15):
                src_name = futures[future]
                try:
                    results = future.result()
                    for r in results:
                        url = r.get("url", "")
                        if url and url not in seen_urls:
                            academic_sources.append({
                                "title": r.get("title", "")[:80],
                                "url": url,
                                "source": src_name,
                                "snippet": r.get("snippet", "")[:120],
                            })
                            seen_urls.add(url)
                except Exception:
                    pass
        log_lines.append(f"[학술] KCI·RISS·arXiv 출처={len(academic_sources)}건")
    except Exception as e:
        log_lines.append(f"[학술] 오류: {str(e)[:80]}")

    # ── Stage ④: verify collected sources by crawling and comparing ───────
    _prog(0.75, "④ 출처 검증 (크롤링 대조)...")

    raw_sources = gemini_sources + brave_sources + academic_sources
    log_lines.append(f"[수집] 총 {len(raw_sources)}건 (검증 전)")

    verified_sources, avg_similarity, crawled_texts = _verify_sources_parallel(
        text, raw_sources, min_similarity=0.02, max_workers=8
    )

    verified_urls = {s["url"] for s in verified_sources}
    unverified_sources = [s for s in raw_sources if s["url"] not in verified_urls]

    log_lines.append(f"[검증] 통과={len(verified_sources)}건, 미검증={len(unverified_sources)}건, 크롤링={len(crawled_texts)}건, 평균유사도={avg_similarity:.3f}")

    # ── Stage ⑤: per-sentence vector-DB comparison ────────────────────────
    vector_flagged = []
    vector_plag_ratio = 0.0
    vector_source_hits = {}
    vdb = None

    if HAS_CHROMADB and crawled_texts:
        _prog(0.80, "⑤ 벡터DB 문장별 정밀 대조...")
        try:
            vdb = VectorPlagiarismDB()
            if vdb.available:
                indexed_count = 0
                for url, info in crawled_texts.items():
                    n = vdb.index_source(
                        url=url,
                        title=info.get("title", ""),
                        text=info["text"],
                        source_type=info.get("source", "Web"),
                    )
                    indexed_count += n

                log_lines.append(f"[벡터DB] {len(crawled_texts)}개 출처 → {indexed_count}개 청크 인덱싱")

                if indexed_count > 0:
                    vector_flagged, vector_plag_ratio, vector_source_hits = vdb.check_document(
                        text, min_similarity=35
                    )
                    log_lines.append(
                        f"[벡터DB] 표절 문장={len(vector_flagged)}건, "
                        f"문장표절율={vector_plag_ratio:.1f}%, "
                        f"히트출처={len(vector_source_hits)}건"
                    )
        except Exception as e:
            log_lines.append(f"[벡터DB] 오류: {str(e)[:80]}")
    elif not HAS_CHROMADB:
        log_lines.append("[벡터DB] chromadb 미설치 — 건너뜀")
    else:
        log_lines.append("[벡터DB] 크롤링 데이터 없음 — 건너뜀")

    # ── Combine scores and build the report ───────────────────────────────
    _prog(0.90, "보고서 생성...")

    all_sources = verified_sources

    # n-gram score: weighted mix of the best match, the mean similarity and
    # a count factor (3 points per verified source, capped at 30).
    max_sim = max((s.get("similarity", 0) for s in verified_sources), default=0)
    count_factor = min(len(verified_sources) * 3, 30)
    avg_factor = avg_similarity * 100
    ngram_score = min(round(max_sim * 0.4 + avg_factor * 0.3 + count_factor * 0.3), 100)

    vector_score = round(vector_plag_ratio) if vector_flagged else 0

    # Vector evidence (sentence-level) outweighs the n-gram heuristic.
    if vector_flagged:
        combined_score = round(ngram_score * 0.4 + vector_score * 0.6)
    else:
        combined_score = ngram_score

    # Gemini's self-reported percentage only counts when sources verified;
    # without any verified source it is capped hard at 20%.
    if gemini_pct > 0 and len(verified_sources) > 0:
        plag_pct = min(round(gemini_pct * 0.2 + combined_score * 0.8), 100)
    elif len(verified_sources) > 0:
        plag_pct = combined_score
    else:
        plag_pct = min(round(gemini_pct * 0.1), 20)

    log_lines.append(
        f"[판정] Gemini={gemini_pct}%, n-gram={ngram_score}%, "
        f"벡터={vector_score}%, → 종합={plag_pct}%"
    )

    if plag_pct >= 50:
        grade, gc = "🚨 표절 의심", "#FF4444"
    elif plag_pct >= 30:
        grade, gc = "⚠️ 주의 필요", "#FF8800"
    elif plag_pct >= 15:
        grade, gc = "📌 유사표현", "#DDAA00"
    elif plag_pct >= 5:
        grade, gc = "✓ 양호", "#4ECDC4"
    else:
        grade, gc = "✅ 우수", "#22AA44"

    word_count = len(_split_words(text))
    char_count = len(text)

    # ── HTML rendering helpers ────────────────────────────────────────────
    def _source_badge(src):
        # Colored label per search-engine source.
        colors = {
            "Google": "#4285F4", "Brave": "#FB542B", "Web": "#888",
            "KCI": "#2E7D32", "RISS": "#1565C0", "arXiv": "#B71C1C",
        }
        c = colors.get(src, "#666")
        return f'<span style="display:inline-block;padding:2px 6px;border-radius:3px;background:{c};color:#fff;font-size:9px;font-weight:700;">{src}</span>'

    # NOTE(review): titles/snippets from external sites are interpolated into
    # the report without HTML-escaping — potential markup injection.
    src_rows = ""
    for i, s in enumerate(all_sources[:30]):
        title_display = s['title'][:55] if s['title'] else (
            s['url'].split('/')[2] if len(s['url'].split('/')) > 2 else s['url'][:40]
        )
        sim_val = s.get("similarity", 0)
        sim_color = "#FF4444" if sim_val >= 30 else "#FF8800" if sim_val >= 10 else "#DDAA00" if sim_val >= 3 else "#999"
        sim_html = f'<span style="font-weight:700;color:{sim_color};">{sim_val:.1f}%</span>'
        method = s.get("method", "")
        method_tag = f' <span style="font-size:8px;color:#aaa;">({method})</span>' if method else ""
        snippet_html = (
            f'<div style="font-size:9px;color:#888;margin-top:2px;">{s["snippet"][:100]}</div>'
            if s.get("snippet") else ""
        )
        matched = s.get("matched", [])
        matched_html = ""
        if matched:
            matched_html = '<div style="font-size:9px;color:#C62828;margin-top:2px;">매칭: ' + " | ".join(matched[:2]) + '</div>'
        src_rows += f"""<tr style="border-bottom:1px solid #E8E8E8;">
<td style="padding:8px;text-align:center;font-size:11px;color:#666;">{i + 1}</td>
<td style="padding:8px;">{_source_badge(s.get('source', ''))}</td>
<td style="padding:8px;text-align:center;">{sim_html}{method_tag}</td>
<td style="padding:8px;"><a href="{s['url']}" target="_blank" rel="noopener noreferrer" style="color:#2E86C1;text-decoration:none;font-weight:600;font-size:11px;">{title_display}</a>{snippet_html}{matched_html}</td>
<td style="padding:8px;font-size:9px;color:#999;word-break:break-all;max-width:180px;"><a href="{s['url']}" target="_blank" rel="noopener noreferrer" style="color:#999;text-decoration:none;">{s['url'][:60]}</a></td>
</tr>"""

    if not src_rows:
        src_rows = '<tr><td colspan="5" style="padding:20px;text-align:center;color:#999;">검증된 유사 출처 없음 — 표절 가능성 낮음</td></tr>'

    # Collapsible Gemini analysis detail (HTML-escaped).
    gemini_summary = ""
    if gemini_response:
        # FIX: restore real HTML escaping — the escape targets had been
        # corrupted into no-op replace('<', '<') / replace('>', '>') calls.
        safe_resp = gemini_response.replace('<', '&lt;').replace('>', '&gt;').replace('\n', '<br>')
        gemini_summary = f"""
<div style="padding:16px 24px;border-bottom:1px solid #E0E0E0;">
<details>
<summary style="cursor:pointer;font-size:13px;font-weight:700;color:#1A3C6E;">🤖 Gemini 분석 상세</summary>
<div style="margin-top:10px;padding:12px;background:#F8F9FA;border-radius:6px;font-size:11px;line-height:1.7;color:#333;max-height:300px;overflow-y:auto;">{safe_resp}</div>
</details>
</div>"""

    # Sentence-level vector-DB findings table.
    vector_section = ""
    if vector_flagged:
        vf_rows = ""
        for k, vf in enumerate(vector_flagged[:15]):
            sim_val = vf["similarity"]
            sim_color = "#FF4444" if sim_val >= 70 else "#FF8800" if sim_val >= 50 else "#DDAA00"
            # FIX: same escaping restoration as above.
            sent_safe = vf["sentence"][:70].replace('<', '&lt;')
            matched_safe = vf["matched_text"][:90].replace('<', '&lt;')
            title_safe = vf["title"][:40].replace('<', '&lt;')
            vf_rows += f"""<tr style="border-bottom:1px solid #F0F0F0;">
<td style="padding:6px;text-align:center;font-size:10px;color:#666;">{k+1}</td>
<td style="padding:6px;font-size:10px;color:#333;">{sent_safe}</td>
<td style="padding:6px;text-align:center;"><span style="font-weight:700;color:{sim_color};">{sim_val:.0f}%</span></td>
<td style="padding:6px;font-size:9px;color:#666;">{matched_safe}</td>
<td style="padding:6px;font-size:9px;"><a href="{vf['url']}" target="_blank" rel="noopener noreferrer" style="color:#2E86C1;text-decoration:none;">{title_safe}</a></td>
</tr>"""

        vdb_stats = vdb.get_stats() if vdb else {"chunks": 0}
        vector_section = f"""
<div style="padding:16px 24px;border-bottom:1px solid #E0E0E0;">
<details open>
<summary style="cursor:pointer;font-size:13px;font-weight:700;color:#1A3C6E;">
🧬 벡터DB 문장별 정밀 대조 — 표절 문장 {len(vector_flagged)}건 / 문장표절율 {vector_plag_ratio:.1f}%
<span style="font-size:10px;font-weight:400;color:#888;margin-left:8px;">(DB: {vdb_stats['chunks']}청크)</span>
</summary>
<table style="width:100%;border-collapse:collapse;font-size:11px;margin-top:10px;">
<thead>
<tr style="background:#E8EAF6;color:#333;">
<th style="padding:8px;width:30px;">#</th>
<th style="padding:8px;text-align:left;">입력 문장</th>
<th style="padding:8px;width:55px;">유사도</th>
<th style="padding:8px;text-align:left;">매칭 출처 내용</th>
<th style="padding:8px;width:120px;">출처</th>
</tr>
</thead>
<tbody>{vf_rows}</tbody>
</table>
</details>
</div>"""

    HDR = '#3B7DD8'
    html = f"""<div style="font-family:'Noto Sans KR',sans-serif;max-width:900px;margin:20px auto;background:#fff;border:1px solid #E0E0E0;border-radius:8px;box-shadow:0 2px 8px rgba(0,0,0,0.06);">
<div style="background:linear-gradient(135deg,{HDR},#4A8DE0);padding:24px;color:#fff;border-radius:8px 8px 0 0;">
<div style="display:flex;justify-content:space-between;align-items:center;">
<div>
<div style="font-size:24px;font-weight:900;">표절 검사 결과</div>
<div style="font-size:12px;opacity:0.9;margin-top:4px;">Gemini + Brave + KCI·RISS·arXiv + 벡터DB 정밀대조</div>
</div>
<div style="text-align:right;font-size:11px;opacity:0.9;">
<div>문서: {doc_id}</div>
<div>{now}</div>
</div>
</div>
</div>
<div style="padding:24px;background:#FAFBFE;border-bottom:1px solid #E0E0E0;">
<div style="display:grid;grid-template-columns:1fr 1fr 1fr 1fr;gap:12px;">
<div style="text-align:center;padding:16px;background:#fff;border-radius:6px;border:1px solid #E0E0E0;">
<div style="font-size:42px;font-weight:900;color:{gc};">{plag_pct}%</div>
<div style="font-size:11px;color:#666;margin-top:6px;">종합 표절율</div>
</div>
<div style="text-align:center;padding:16px;background:#fff;border-radius:6px;border:1px solid #E0E0E0;">
<div style="font-size:22px;font-weight:900;color:{gc};margin-top:6px;">{grade}</div>
<div style="font-size:11px;color:#666;margin-top:6px;">판정</div>
</div>
<div style="text-align:center;padding:16px;background:#fff;border-radius:6px;border:1px solid #E0E0E0;">
<div style="font-size:28px;font-weight:900;color:#555;">{len(all_sources)}</div>
<div style="font-size:11px;color:#666;margin-top:6px;">검증 출처</div>
</div>
<div style="text-align:center;padding:16px;background:#fff;border-radius:6px;border:1px solid #E0E0E0;">
<div style="font-size:14px;font-weight:700;color:#4285F4;margin-top:4px;">{len(raw_sources)} 수집</div>
<div style="font-size:14px;font-weight:700;color:#22AA44;">{len(verified_sources)} 검증</div>
<div style="font-size:14px;font-weight:700;color:#999;">{len(unverified_sources)} 제외</div>
<div style="font-size:9px;color:#666;margin-top:2px;">크롤링 대조</div>
</div>
</div>
</div>
<div style="padding:16px 24px;border-bottom:1px solid #E0E0E0;">
<div style="font-size:13px;font-weight:700;color:#1A3C6E;margin-bottom:8px;">📋 검사 정보</div>
<div style="display:flex;gap:24px;font-size:12px;">
<span><span style="color:#888;">글자수</span> <b>{char_count:,}자</b></span>
<span><span style="color:#888;">단어수</span> <b>{word_count:,}단어</b></span>
<span><span style="color:#888;">검색엔진</span> <b>Google + Brave + KCI·RISS·arXiv</b></span>
</div>
</div>{gemini_summary}{vector_section}
<div style="padding:24px;border-bottom:1px solid #E0E0E0;">
<div style="font-size:13px;font-weight:700;color:#1A3C6E;margin-bottom:12px;">🔍 검증된 출처 ({len(all_sources)}건)</div>
<table style="width:100%;border-collapse:collapse;font-size:11px;">
<thead>
<tr style="background:{HDR};color:white;">
<th style="padding:10px;text-align:center;width:35px;">#</th>
<th style="padding:10px;text-align:center;width:55px;">소스</th>
<th style="padding:10px;text-align:center;width:65px;">유사도</th>
<th style="padding:10px;text-align:left;">출처</th>
<th style="padding:10px;text-align:left;width:180px;">URL</th>
</tr>
</thead>
<tbody>{src_rows}</tbody>
</table>
</div>
</div>"""

    _prog(0.95, "완료!")
    log_text = "\n".join(log_lines)

    return html, log_text