Commit 052f0308 by unknown

init

parent 4371d0fa
## Full-Page Image Search and Question Search
#### 1. Requirements
Requirement 1: given a photo of a complete book page taken by the user, find the matching full-page image record in the ES index.
Requirement 2: given a photo of a single question taken by the user, find the matching question image record in the ES index.
Server address: 61.170.32.8
Project path: /home/liuxin/work/search_question
Conda environment: conda activate search_book
#### 2. Search Approach
```
Preparation:
1. Split each exercise book into complete page images, parse every page image with Hehe (合合) OCR, and store the result in the excel column page_ocr.
2. Split each exercise book into individual question images and parse each with Hehe OCR, storing the result in the column question_ocr. Feed the question image together with its Hehe OCR text into the Doubao embedding model "doubao-embedding-vision-250615" to obtain a 1024-dimensional vector, serialize it as a JSON string, and store it in the excel column question_image_embedding.
3. The backend inserts the excel data into the ES database through an API (sample document: "12668836-2025.xlsx"). Four columns are added to the excel downloaded from the original book-splitting platform: question_ocr (OCR of the question), source_image_url (URL of the full-page image), page_ocr (OCR of the full page), and question_image_embedding (joint vector of the question OCR and the question image).

Full-page search flow:
1. The user photographs a complete book page with a phone and uploads it.
2. The uploaded photo is parsed with Hehe OCR.
3. The page-search API receives the long OCR string of the photo and uses it to query ES.
4. The top-1 ES hit is returned.

Question search flow:
1. The user photographs a single question with a phone and uploads it.
2. The question photo is parsed with Hehe OCR.
3. The question OCR text and the question image are fed together into the Doubao embedding model "doubao-embedding-vision-250615" to obtain a 1024-dimensional vector (see the hedged embedding sketch after this block).
4. The search API accepts either just the Hehe OCR text of the question image, or the 1024-dimensional vector.
5. OCR-only question search reaches 99% accuracy; embedding-based search reaches 100%.
```
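
For step 3 of the question flow, the sketch below shows one way to obtain the 1024-dimensional vector. Only the model name comes from this document; the endpoint URL, authentication scheme, payload shape, and response shape are assumptions based on the Volcengine Ark multimodal embeddings API and should be checked against the current Ark documentation.
```python
# Hedged sketch: embed (question image + Hehe OCR text) with Doubao.
import base64
import os

import requests

ARK_URL = "https://ark.cn-beijing.volces.com/api/v3/embeddings/multimodal"  # assumed endpoint

def embed_question(image_path: str, question_ocr: str) -> list:
    with open(image_path, "rb") as f:
        b64 = base64.b64encode(f.read()).decode("ascii")
    payload = {
        "model": "doubao-embedding-vision-250615",
        "input": [
            {"type": "text", "text": question_ocr},
            {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{b64}"}},
        ],
    }
    headers = {"Authorization": f"Bearer {os.environ['ARK_API_KEY']}"}  # assumed auth scheme
    resp = requests.post(ARK_URL, headers=headers, json=payload)
    resp.raise_for_status()
    return resp.json()["data"]["embedding"]  # assumed response shape; expect 1024 floats
```
The returned list can be serialized with json.dumps and written to the question_image_embedding column, which is the format the insert interface below expects.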
#### 3. Interfaces
```python
# 1. Insert excel data into the ES database
import requests

def insert_excel(data):
    # url = 'http://localhost:31001/upload/excel'
    url = 'http://61.170.32.8:31001/upload/excel'
    headers = {'accept': 'application/json'}
    file_path = data['file_path']
    # Upload the file
    with open(file_path, 'rb') as file:
        files = {
            'file': (file_path, file),
        }
        # Send the request while the file handle is still open
        response = requests.post(url, headers=headers, data=data, files=files)
    # Print the response
    print(f"Status code: {response.status_code}")
    print(f"Response body: {response.json()}")

data = {'book_id': '12670279', 'book_name': '衔接教材地理', "file_path": '12670279-2025版《锦上添花(期末大赢家)》下册.xlsx'}
insert_excel(data)
```
```python
# 2. Delete all documents in ES that share the given book_id
import json

import requests

def delete_es_book(book_id="12663121"):
    # url = 'http://localhost:31001/delete_book'
    url = 'http://61.170.32.8:31001/delete_book'
    headers = {
        'accept': 'application/json',
    }
    # Request body (JSON)
    data = {
        "book_id": book_id
    }
    data = json.dumps(data, ensure_ascii=False)
    response = requests.post(url, headers=headers, data=data)
    # Print the response
    print(f"Status code: {response.status_code}")
    res = response.json()
    res = json.dumps(res, ensure_ascii=False, indent=4)
    print(res)

book_id = "12667382"
delete_es_book(book_id)
```
```python
# 3. Full-page image search
import json

import requests

def search_book_question(book_id, image_ocr):
    # url = 'http://localhost:31001/search_page'
    url = 'http://61.170.32.8:31001/search_page'
    headers = {
        'accept': 'application/json',
    }
    # Request body (JSON)
    data = {
        "book_id": book_id,
        "image_ocr": image_ocr,
        "top_k": 1
    }
    data = json.dumps(data, ensure_ascii=False)
    response = requests.post(url, headers=headers, data=data)
    # Print the response
    print(f"Status code: {response.status_code}")
    res = response.json()
    bankId = res['es_search'][0]['_source']['bankId']
    print("bankId: ", bankId)
    res = json.dumps(res, ensure_ascii=False, indent=4)
    return res

book_id = "12664728"
image_ocr = "Hehe OCR text of the full page photographed by the user"
res = search_book_question(book_id, image_ocr)
print(res)
```
#### 4. Question Search Interface (Hehe OCR text only)
```python
# 4. Question search (Hehe OCR text only)
import json

import requests

def search_question_text(book_id, question_ocr):
    url = 'http://localhost:31001/search_question_text'
    # url = 'http://61.170.32.8:31001/search_question_text'
    headers = {
        'accept': 'application/json',
    }
    # Request body (JSON)
    data = {
        "book_id": book_id,
        "question_ocr": question_ocr,
        "top_k": 1
    }
    data = json.dumps(data, ensure_ascii=False)
    response = requests.post(url, headers=headers, data=data)
    # Print the response
    print(f"Status code: {response.status_code}")
    res = response.json()
    bankId = res['es_search'][0]['_source']['bankId']
    print("bankId: ", bankId)
    res = json.dumps(res, ensure_ascii=False, indent=4)
    return res

book_id = "12667382"
question_ocr = "Hehe OCR text of the question photo"
search_question_text(book_id, question_ocr)
```
#### 5. Question Search Interface (joint vector of question OCR and question image)
```python
# 5. Question search (joint vector of question OCR and question image)
import json

import numpy as np
import requests

def search_question_embedding(book_id: str, question_embedding):
    url = 'http://localhost:31001/search_question_embedding'
    # url = 'http://61.170.32.8:31001/search_question_embedding'
    headers = {
        'accept': 'application/json',
    }
    # Request body (JSON)
    data = {
        "book_id": book_id,
        "question_embedding": question_embedding,
        "top_k": 1
    }
    data = json.dumps(data, ensure_ascii=False)
    response = requests.post(url, headers=headers, data=data)
    # Print the response
    print(f"Status code: {response.status_code}")
    res = response.json()
    bankId = res['es_search'][0]['_source']['bankId']
    print("bankId: ", bankId)
    res = json.dumps(res, ensure_ascii=False, indent=4)
    return res

book_id = "12670279"
# In production this is the 1024-d vector from the Doubao model
# "doubao-embedding-vision-250615"; a random vector stands in here.
question_embedding = np.random.random(1024).tolist()
question_embedding = json.dumps(question_embedding)
res = search_question_embedding(book_id, question_embedding)
```
#### 6. Start the Service
```bash
# On server 61.170.32.8
$ cd /home/liuxin/work/search_question
$ conda activate search_book
$ nohup python -u api_service.py > api_service.log 2>&1 &
$ tail -f log/search_question.log
```
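
api_service.py itself is not shown in this section, so purely as an illustration of the request/response contract documented above, here is a minimal sketch of what the /search_page handler might look like, assuming FastAPI and a plain match query on page_ocr. The index name, query shape, and handler structure are assumptions; only the field names (book_id, image_ocr, top_k, es_search) come from this document.
```python
# Hypothetical /search_page handler; verify against the real api_service.py.
from elasticsearch import Elasticsearch
from fastapi import FastAPI
from pydantic import BaseModel

app = FastAPI()
es = Elasticsearch(hosts=["http://192.168.8.224:19200"])

class PageQuery(BaseModel):
    book_id: str
    image_ocr: str
    top_k: int = 1

@app.post("/search_page")
def search_page(q: PageQuery):
    # Full-text match on the stored page OCR, restricted to a single book.
    query = {
        "bool": {
            "must": [{"match": {"page_ocr": q.image_ocr}}],
            "filter": [{"term": {"book_id": q.book_id}}],
        }
    }
    results = es.search(index="book_pages", query=query, size=q.top_k)  # index name assumed
    return {"es_search": results["hits"]["hits"]}
```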
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
import numpy as np
import json

# Connect to Elasticsearch
es = Elasticsearch(
    hosts=["http://192.168.8.224:19200"],  # new ES instance set up by Wang Lei
    # basic_auth=("elastic", "password")  # replace with your credentials
)
# Create the index if it does not exist
def create_index(index_name):
    if not es.indices.exists(index=index_name):
        index_settings = {
            "mappings": {
                "properties": {
                    "image_url": {"type": "keyword"},
                    "image_description": {"type": "text"},
                    "image_vector": {
                        "type": "dense_vector",
                        "dims": 2048,  # vector dimension; adjust to your embedding model
                        "index": True,
                        "similarity": "cosine"
                    }
                }
            }
        }
        es.indices.create(index=index_name, body=index_settings)
        print(f"Index {index_name} created")

# Insert a single image document
def insert_single_image(index_name, image_url, image_description, image_vector):
    doc = {
        "image_url": image_url,
        "image_description": image_description,
        "image_vector": image_vector
    }
    es.index(index=index_name, document=doc)
    print(f"Image {image_url} inserted")

# Bulk-insert image documents
def insert_bulk_images(index_name, image_data_list):
    actions = []
    for data in image_data_list:
        action = {
            "_index": index_name,
            "_source": {
                "image_url": data["image_url"],
                "image_description": data["image_description"],
                "image_vector": data["image_vector"]
            }
        }
        actions.append(action)
    bulk(es, actions)
    print(f"Bulk-inserted {len(actions)} documents")

# Full-text search
def text_search(index_name, query_text, size=10):
    query = {
        "match": {
            "image_description": query_text
        }
    }
    results = es.search(index=index_name, query=query, size=size)
    return results['hits']['hits']

# Vector search via script_score
def vector_search(index_name, query_vector, size=10):
    query = {
        "script_score": {
            "query": {"match_all": {}},
            "script": {
                "source": "cosineSimilarity(params.query_vector, 'image_vector') + 1.0",
                "params": {"query_vector": query_vector}
            }
        }
    }
    results = es.search(index=index_name, query=query, size=size)
    return results['hits']['hits']
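
# Hedged alternative: the dense_vector mapping above (index=True,
# similarity="cosine") also enables Elasticsearch 8.x native approximate kNN
# search, which is usually faster than script_score at scale. The
# num_candidates heuristic below is an assumption, not a tuned value.
def vector_search_knn(index_name, query_vector, size=10):
    knn = {
        "field": "image_vector",
        "query_vector": query_vector,
        "k": size,
        "num_candidates": max(100, size * 10),
    }
    results = es.search(index=index_name, knn=knn, size=size)
    return results['hits']['hits']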
# Combined text + vector search (weighted bool query)
def hybrid_search(index_name, query_text, query_vector, text_weight=0.5, vector_weight=0.5, size=10):
    query = {
        "bool": {
            "should": [
                {
                    "match": {
                        "image_description": {
                            "query": query_text,
                            "boost": text_weight
                        }
                    }
                },
                {
                    "script_score": {
                        "query": {"match_all": {}},
                        "script": {
                            "source": "cosineSimilarity(params.query_vector, 'image_vector') * params.vector_weight",
                            "params": {
                                "query_vector": query_vector,
                                "vector_weight": vector_weight
                            }
                        }
                    }
                }
            ]
        }
    }
    results = es.search(index=index_name, query=query, size=size)
    return results['hits']['hits']
# RRF fusion
def rrf_fusion(text_results, vector_results, k=60):
    """
    Fuse two retrieval result lists with Reciprocal Rank Fusion (RRF).

    Args:
        text_results: ES full-text search hits
        vector_results: ES vector search hits
        k: rank constant in the RRF formula, conventionally 60

    Returns:
        Fused result list
    """
    # Map document ID -> rank in the text results
    text_rank_map = {hit['_id']: rank + 1 for rank, hit in enumerate(text_results)}
    # Map document ID -> rank in the vector results
    vector_rank_map = {hit['_id']: rank + 1 for rank, hit in enumerate(vector_results)}
    # Union of all document IDs
    all_doc_ids = set(text_rank_map.keys()).union(set(vector_rank_map.keys()))
    # Compute RRF scores
    rrf_scores = {}
    for doc_id in all_doc_ids:
        text_rank = text_rank_map.get(doc_id, float('inf'))
        vector_rank = vector_rank_map.get(doc_id, float('inf'))
        # RRF formula: 1 / (k + rank)
        text_score = 1 / (k + text_rank) if text_rank != float('inf') else 0
        vector_score = 1 / (k + vector_rank) if vector_rank != float('inf') else 0
        rrf_scores[doc_id] = text_score + vector_score
    # Sort by RRF score, descending
    sorted_doc_ids = sorted(rrf_scores.items(), key=lambda x: x[1], reverse=True)
    # Build the final result list
    final_results = []
    # Prefer the full hit from the text results
    id_to_hit = {hit['_id']: hit for hit in text_results}
    # Add hits that only appear in the vector results
    for hit in vector_results:
        if hit['_id'] not in id_to_hit:
            id_to_hit[hit['_id']] = hit
    for doc_id, score in sorted_doc_ids:
        if doc_id in id_to_hit:
            result = {
                '_id': doc_id,
                '_score': score,
                '_source': id_to_hit[doc_id]['_source']
            }
            final_results.append(result)
    return final_results
# RRF-fused text + vector search
def hybrid_search_rrf(index_name, query_text, query_vector, size=10, rrf_k=60):
    """Fuse text and vector search results with RRF."""
    # Run both searches
    text_hits = text_search(index_name, query_text)
    vector_hits = vector_search(index_name, query_vector)
    # Fuse with RRF
    fused_results = rrf_fusion(text_hits, vector_hits, k=rrf_k)
    # Return the top `size` results
    return fused_results[:size]
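
# Worked example of the RRF arithmetic above (illustrative numbers, k=60):
# a document ranked 1st by text and 3rd by vector scores
#   1/(60+1) + 1/(60+3) ≈ 0.0164 + 0.0159 ≈ 0.0323,
# while a document ranked 2nd by text alone scores 1/(60+2) ≈ 0.0161,
# so agreement between the two retrievers outweighs a single good rank.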
# Example usage
if __name__ == "__main__":
    INDEX_NAME = "tst_images_text"

    # 1. Create the index
    create_index(INDEX_NAME)

    # Sample data
    sample_images = [
        {
            "image_url": "https://oss.5rs.me/oss/upload/image/jpeg/b2f3fbaa98a3f2552f2dba7451decf7b_36_20250605135114454.jpeg",
            "image_description": "、填一填。培优作业18 80 78 2.比 16 少9的数是( )多8的数是 17;27 );比( 与 30的和是( );47比3多( )个十和( )个一;7个十和2个一 3.83 里面有( 合起来是( ),它们 4.最大的两位数是(),最小的三位数是(相差(5.写一写,比一比。百位十位个位百位十位个位百位十位个位选择。(把正确答案的序号填在括号里)1.根据艾蒿叶子的影子判断生长时间最短的叶子是(2.在远古时代,为了记录猎物的多少,人们常用石子计数。假设用0表示10.用一表示1,那么下面表示的数最大的是",
            "image_vector": np.random.random(2048).tolist()  # replace with the real image vector
        },
        {
            "image_url": "https://oss.5rs.me/oss/upload/image/jpeg/9579d101ec560c85f204fa11b7a792c0_2_20250605094300322.jpeg",
            "image_description": "、直接写得数。培优作业1*18 x10=50 x90 =18 x41~51 x22~30x40 =12 x70 =32 x49 ~17 x78 ~20 x15 =44 x20 =78 x11~46 x23 ~二、填一填。1.13的30倍是(),10个22的和是()位数,积大约是(2.49 x28 的积是()。3.25x40的积的末尾有()个0。4.小亮平均每分钟走69米,他从家走到学校用了12分钟。小亮家离学校大约有()米。5.晶晶平均每分钟可以写25个字,照这样的速度,她15分钟可以写)个字。三、用竖式计算,并验算。18 x51 =32 x60 =46 x25 =四、在( )里填上“ >”“<”或“=”30 x30( )100070x12()80034 x56)36 x5415 x40( )60011 x44(訕興癪9 x13)450)13 x8921 x3425 x801圧ʌ帳八廈仈 x30)600)200023 x45",
            "image_vector": np.random.random(2048).tolist()  # replace with the real image vector
        }
    ]

    # 2. Bulk insert
    # insert_bulk_images(INDEX_NAME, sample_images)

    # 3. Text search example
    print("\n3. Text search results:")
    text_results = text_search(INDEX_NAME, "填一填")
    for hit in text_results:
        print(f"Score: {hit['_score']}, \nDescription: {hit['_source']['image_description']}")
        print()

    # 4. Vector search example
    print("\n4. Vector search results:")
    vector_results = vector_search(INDEX_NAME, sample_images[0]["image_vector"])
    for hit in vector_results:
        print(f"Score: {hit['_score']}, \nDescription: {hit['_source']['image_description']}")
        print()

    # 5. Hybrid search example
    print("\n5. Hybrid search results:")
    hybrid_results = hybrid_search(
        INDEX_NAME,
        "可爱的动物",  # query text ("cute animals")
        sample_images[0]["image_vector"],
        text_weight=0.4,
        vector_weight=0.6
    )
    for hit in hybrid_results:
        print(f"Score: {hit['_score']}, \nDescription: {hit['_source']['image_description']}")
        print()

    # 6. RRF hybrid search example
    print("\n6. RRF hybrid search results:")
    hybrid_results = hybrid_search_rrf(
        INDEX_NAME,
        "填一填。培优作业18 80 78 2.比 16 少9的数是( )多8的数是 17;27 );比( 与 30的和是( );47比3多( )个十和( )个一;7个十和2个",
        sample_images[0]["image_vector"],
        size=10,
        rrf_k=60
    )
    for hit in hybrid_results:
        print(f"Score: {hit['_score']:.4f}, \nDescription: {hit['_source']['image_description']}")
        print()

    print("\nfinished.")
from tritonclient import grpc as triton_grpc
import numpy as np
import json
from PIL import Image
import io
import base64
import time

INPUT_NAME = "input"
OUTPUT_NAME = "output"

def _create_triton_input(data):
    # Serialize the request dict to UTF-8 JSON inside a [1, 1] object array
    data = json.dumps(data, separators=(",", ":"))
    data = data.encode("utf-8")
    data = [[data]]
    data = np.array(data, dtype=np.object_)
    return data

def _parse_triton_output(data):
    # Decode the [1, 1] byte array back into a dict
    data = data[0, 0]
    data = data.decode("utf-8")
    data = json.loads(data)
    return data

def triton_request(client, data, *, request_kwargs=None):
    if request_kwargs is None:
        request_kwargs = {}
    input_ = triton_grpc.InferInput(INPUT_NAME, [1, 1], "BYTES")
    input_.set_data_from_numpy(_create_triton_input(data))
    results = client.infer('ocr', inputs=[input_], **request_kwargs)
    output = results.as_numpy(OUTPUT_NAME)
    return _parse_triton_output(output)

def infer_paddle_ocr(im: Image.Image, client):
    # Encode the image as base64 JPEG and query the PaddleOCR Triton service,
    # retrying until the service reports success (errorCode == 0)
    buffer = io.BytesIO()
    im.save(buffer, format='JPEG')
    buffer.seek(0)
    base64_file = base64.b64encode(buffer.read()).decode("ascii")
    input_ = {"file": base64_file,
              "fileType": 1}
    while True:
        output_ocr = triton_request(client, input_)
        errorCode = output_ocr["errorCode"]
        if errorCode == 0:
            result = output_ocr["result"]
            res_ocr = result["ocrResults"][0]['prunedResult']
            db_boxes = res_ocr['rec_boxes']
            rec_texts = res_ocr['rec_texts']
            break
    return db_boxes, rec_texts

if __name__ == '__main__':
    start_time = time.time()
    paddle_ocr_url = '61.170.32.8:38896'
    client_ocr = triton_grpc.InferenceServerClient(paddle_ocr_url)
    # im = '/data/liyaze/doc2xkey_server_v2/utils/my-obs-object-key-demo.jpg'
    # im = "/home/liuxin/work/search_book/data/12664728-课计划·七年级英语·RJ·上册/1.jpg"
    im = "/home/liuxin/work/search_question/data/12664728-课计划·七年级英语·RJ·上册/93.jpg"
    im = Image.open(im).convert('RGB')
    db_boxes, rec_texts = infer_paddle_ocr(im, client_ocr)
    print(f"Elapsed: {time.time() - start_time}s")
    print(rec_texts)
import re, json, sys, os
import pandas as pd
import numpy as np

class prepareData():
    def __init__(self):
        # ES field names for each excel column
        self.index = "index"  # int
        self.fileId = "bankId"  # int
        self.question_image_url = "question_image_url"
        self.question_text = "question_text"
        self.answer_image_url = "answer_image_url"
        self.answer_text = "answer_text"
        self.question_chapter = "question_chapter"
        self.question_category = "question_category"
        self.page_id = "page_id"  # int
        self.book_page_id = "book_page_id"
        self.knowledge_point = "knowledge_point"
        self.knowledge_point_code = "knowledge_point_code"
        self.difficult = "difficult"
        self.resolve = "resolve"
        self.appendix = "appendix"
        self.source_image_url = "source_image_url"
        self.page_ocr = "page_ocr"
        self.question_ocr = 'question_ocr'
        # Map the Chinese excel headers to the ES field names
        self.title2tile = {
            "序号": self.index,
            "题目id": self.fileId,
            "题目图片地址": self.question_image_url,
            "题目文本": self.question_text,
            "答案图片地址": self.answer_image_url,
            "答案文本": self.answer_text,
            "所属章节": self.question_chapter,
            "题目类型": self.question_category,
            "电子页码": self.page_id,
            "纸书页码": self.book_page_id,
            "知识点": self.knowledge_point,
            "知识点代码": self.knowledge_point_code,
            "难度": self.difficult,
            "解析": self.resolve,
            "附件": self.appendix,
            "source_image_url": self.source_image_url,
            "page_ocr": self.page_ocr,
            "question_ocr": self.question_ocr,
        }

    def clean(self, text):
        # Strip <img> tags and paragraph markup from HTML-formatted text
        text = re.sub(r'<img[^>]*>', '', text)
        text = re.sub(r"</p><p>", " ", text)
        text = re.sub(r"<p>", " ", text)
        text = re.sub(r"</p>", " ", text)
        text = text.strip()
        return text
    def read_excel(self, excel_path, sheet_name="Sheet1", book_id=None, book_name=None):
        data = pd.read_excel(excel_path, sheet_name)
        data = data.rename(columns=self.title2tile)
        data = data.fillna("")  # fillna returns a new frame, so assign it back
        data = data.to_dict(orient='records')
        page_id2bank_id = {}  # all question ids on each page
        for item in data:
            item[self.index] = int(item[self.index])
            item[self.fileId] = int(item[self.fileId])
            item[self.question_image_url] = str(item[self.question_image_url])
            item[self.question_text] = str(item[self.question_text])
            item[self.answer_image_url] = str(item[self.answer_image_url])
            item[self.answer_text] = str(item[self.answer_text])
            item[self.question_chapter] = str(item[self.question_chapter])
            item[self.question_category] = str(item[self.question_category])
            page_id = int(item[self.page_id])
            item[self.page_id] = int(item[self.page_id])
            item[self.book_page_id] = str(item[self.book_page_id])
            item[self.knowledge_point] = str(item[self.knowledge_point])
            item[self.knowledge_point_code] = str(item[self.knowledge_point_code])
            item[self.difficult] = str(item[self.difficult])
            item[self.resolve] = str(item[self.resolve])
            item[self.appendix] = str(item[self.appendix])
            item['book_id'] = str(book_id)
            item['book_name'] = book_name
            item['book_id_page_id'] = str(book_id) + "-" + str(page_id)
            item.pop('question_category')
            item.pop('appendix')
            # 1. OCR of the full-page image
            # page_ocr = json.loads(item[self.page_ocr])['rec_texts']  # PaddleOCR variant
            # page_ocr = " ".join(page_ocr)
            page_ocr = item[self.page_ocr]  # Hehe OCR
            item[self.page_ocr] = page_ocr
            # 2. OCR of the question image
            # question_ocr = json.loads(item[self.question_ocr])['rec_texts']  # PaddleOCR variant
            # question_ocr = " ".join(question_ocr)
            question_ocr = item[self.question_ocr]  # Hehe OCR
            item[self.question_ocr] = question_ocr
            # 3. Collect all question ids on each page
            bank_id = int(item[self.fileId])
            if page_id in page_id2bank_id:
                page_id2bank_id[page_id].append(bank_id)
            else:
                page_id2bank_id[page_id] = [bank_id]
            # 4. Embedding of the original question image
            if "question_image_embedding" in item.keys():
                item['question_image_embedding'] = json.loads(item['question_image_embedding'])
            # 5. Embedding of the full-page image
            if "page_image_embedding" in item.keys():
                item['page_image_embedding'] = json.loads(item['page_image_embedding'])
        for item in data:
            page_id = int(item[self.page_id])
            bank_ids = page_id2bank_id[page_id]  # e.g. [12, 2134, 321]
            item['page_id2bank_ids'] = bank_ids  # all question ids on this page
        return data
    def write2json(self, path, data_list):
        with open(path, "w", encoding='utf-8') as f:
            data_list = json.dumps(data_list, ensure_ascii=False, indent=4)
            f.write(data_list)
        print(f"success save: {path}")

    def result(self):
        excel_file = "/home/liuxin/work/search_question/data/12671977-暑假生活·学期总复习八年级物理通用版6-原图/12671977-暑假生活·学期总复习八年级物理通用版6.xlsx"
        sheet_name = "题库数据"
        book_id = 12671977
        book_name = '暑假生活·学期总复习八年级物理通用版6'
        data_list = self.read_excel(excel_file, sheet_name=sheet_name, book_id=book_id, book_name=book_name)
        save_json_path = excel_file + ".json"
        self.write2json(save_json_path, data_list)

if __name__ == "__main__":
    prepare_data = prepareData()
    prepare_data.result()
    print("\nfinished.")