Commit 052f0308 by unknown

init

parent 4371d0fa
hi
\ No newline at end of file
## 搜整页图片和搜题
#### 1、需求
需求1、依据用户拍摄的图书整页图片,搜索es库中对应的整页图片数据。
需求2、依据用户拍摄图书某个题目图片, 搜索es库中对应的题目图片数据。
服务器地址: 61.170.32.8
开发文件路径:/home/liuxin/work/search_question
conda虚拟环境:conda activate search_book
#### 2、搜索方案
```
准备工作:
1、将习题图书拆分为整页整页的完整图片,并使用合合OCR解析整页图片,放入excel的page_ocr列。
2、将习题图书拆分为一个个独立的题目图片,并使用合合OCR解析题目图片,将ocr结果放入question_ocr列。将题目图片和题目合合OCR结果一起输入豆包embedding模型"doubao-embedding-vision-250615"得到一个1024维的向量,处理为json字符串后放入excel的“question_image_embedding”列里。
3、后端将以上excel数据通过接口插入es数据库(excel文档样例:“12668836-2025.xlsx”),在原有拆书平台下载的excel文档中添加【question_ocr(题目的ocr结果),source_image_url(整页图片url),page_ocr(整页图片ocr),question_image_embedding(题目ocr和题目原图的向量)】列
搜整页图片流程:
1、用户手机拍摄图书完整页面图片并上传
2、使用合合OCR对用户拍摄的图片进行解析
3、搜整页图片api接口接收用户拍摄图片的ocr长字符串,使用长字符串在es中搜索
4、返回es搜索的top 1数据内容。
搜题目流程:
1、用户手机拍摄某个习题的图片并上传。
2、使用合合OCR对用户拍摄的题目图片进行解析。
3、将题目ocr结果和题目图片作为一个整体输入豆包embedding模型"doubao-embedding-vision-250615"得到一个1024维的向量。
4、可只将题目图片合合OCR结果输入接口,进行搜题。也可以将1024维的向量输入接口进行搜题。
5、只使用合合ocr搜题准确率99%。使用embedding向量搜题准确率100%。
```
#### 3、接口
```python
# 1、excel数据插入es数据库接口
def insert_excel(data):
# 1、excel 数据插入es数据库
url = 'http://localhost:31001/upload/excel'
url = 'http://61.170.32.8:31001/upload/excel'
headers = { 'accept': 'application/json' }
file_path = data['file_path']
# 上传文件
with open(file_path, 'rb') as file:
files = {
'file': (file_path, file),
}
# 发送请求
response = requests.post(url, headers=headers, data=data, files=files)
# 输出响应结果
print(f"状态码: {response.status_code}")
print(f"响应内容: {response.json()}")
data = {'book_id': '12670279', 'book_name': '衔接教材地理', "file_path": '12670279-2025版《锦上添花(期末大赢家)》下册.xlsx' }
insert_excel(data)
```
```python
# 2、依据 book_id 删除es库中book_id相同的所有数据 接口
def delete_es_book(book_id="12663121"):
# 2、指定book_id 删除es中对应的图书数据
url = 'http://localhost:31001/delete_book'
url = 'http://61.170.32.8:31001/delete_book'
headers = {
'accept': 'application/json',
}
# 表单数据
data = {
"book_id": book_id
}
data = json.dumps(data, ensure_ascii=False)
response = requests.post(url, headers=headers, data=data )
# 输出响应结果
print(f"状态码: {response.status_code}")
res = response.json()
res = json.dumps(res, ensure_ascii=False, indent=4)
print(res)
book_id = "12667382"
delete_es_book(book_id)
```
```python
# 3、搜整页图片接口
def search_book_question(book_id, image_ocr):
# 3、搜书搜题
url = 'http://localhost:31001/search_page'
url = 'http://61.170.32.8:31001/search_page'
headers = {
'accept': 'application/json',
}
# 表单数据
data = {
"book_id": book_id,
"image_ocr": image_ocr,
"top_k": 1
}
data = json.dumps(data, ensure_ascii=False)
response = requests.post(url, headers=headers, data=data )
# 输出响应结果
print(f"状态码: {response.status_code}")
res = response.json()
bankId = res['es_search'][0]['_source']['bankId']
print("bankId: ", bankId)
res = json.dumps(res, ensure_ascii=False, indent=4)
return res
book_id = "12664728"
image_ocr = "用户拍摄图片整页的合合ocr结果"
res = search_book_question(book_id, image_ocr)
print(res)
```
#### 4、搜题接口 (只用题目合合OCR结果)
```python
# 4、搜题(只用题目合合OCR结果)接口
def search_question_text(book_id, question_ocr):
url = 'http://localhost:31001/search_question_text'
# url = 'http://61.170.32.8:31001/search_question_text'
headers = {
'accept': 'application/json',
}
# 表单数据
data = {
"book_id": book_id,
"question_ocr": question_ocr,
"top_k": 1
}
data = json.dumps(data, ensure_ascii=False)
response = requests.post(url, headers=headers, data=data )
# 输出响应结果
print(f"状态码: {response.status_code}")
res = response.json()
bankId = res['es_search'][0]['_source']['bankId']
print("bankId: ", bankId)
res = json.dumps(res, ensure_ascii=False, indent=4)
return res
book_id = "12667382"
question_ocr = "题目图片合合ocr结果"
search_question_text(book_id, question_ocr)
```
#### 5、搜题接口 (题目合合ocr和题目图片作为一个整体的向量)
```python
# 5、搜题接口 (题目合合ocr和题目图片作为一个整体的向量)
def search_question_embedding(book_id:str, question_embedding):
url = 'http://localhost:31001/search_question_embedding'
# url = 'http://61.170.32.8:31001/search_question_embedding'
headers = {
'accept': 'application/json',
}
# 表单数据
data = {
"book_id": book_id,
"question_embedding": question_embedding,
"top_k": 1
}
data = json.dumps(data, ensure_ascii=False)
response = requests.post(url, headers=headers, data=data )
# 输出响应结果
print(f"状态码: {response.status_code}")
res = response.json()
bankId = res['es_search'][0]['_source']['bankId']
print("bankId: ", bankId)
res = json.dumps(res, ensure_ascii=False, indent=4)
return res
book_id = "12670279"
# 豆包embedding模型"doubao-embedding-vision-250615"得到一个1024维的向量
question_embedding = np.random.random(1024).tolist()
question_embedding = json.dumps(question_embedding)
res=search_question_embedding(book_id, question_embedding)
```
#### 6、启动服务
```bash
# 61.170.32.8 服务器
$ cd /home/liuxin/work/search_question
$ conda activate search_book
$ nohup python -u api_service.py > api_service.log 2>&1 &
$ tail -f log/search_question.log
```
from fastapi import FastAPI, File, UploadFile, Form, HTTPException
import os,re,json
from typing import Optional
from pathlib import Path
from pydantic import BaseModel
import uvicorn
import logging
from save_es_database import EsHelper
from prepare_data import prepareData
# Directory that contains this script.
__path__ = os.path.dirname(os.path.abspath(__file__))
pre_data = prepareData() # Prepares the book data that is written to ES
app = FastAPI(title="搜图书整页搜题")
es = EsHelper()
index_name = "search_question" # rays book search / question search
# Upload folder and the file types that may be uploaded
UPLOAD_DIR = Path("uploads")
ALLOWED_EXTENSIONS = {"xlsx"}
print("UPLOAD_DIR: ", UPLOAD_DIR)
# Make sure the upload folder exists (relative path, so it is resolved against the working directory)
UPLOAD_DIR.mkdir(exist_ok=True)
embed_dim = 1024 # Dimension of the image vectors stored in ES
# Response schema of the endpoint that inserts an Excel file into ES.
class UploadResponse(BaseModel):
    status: str     # "success" when the upload endpoint completes
    message: str    # human-readable result message
    filename: str   # name the uploaded file was saved under
    file_path: str  # path of the saved file, as a string
    book_id: str
    book_name: str
# Request schema for the whole-page search.
class SearchPage(BaseModel):
    book_id: str
    image_ocr: str  # OCR text of the page image
    top_k: int = 1  # forwarded to the ES search; presumably the number of hits to return
# Request schema for the question search that uses only the OCR text of the question image.
class SearchQuestionText(BaseModel):
    book_id: str
    # OCR result of the whole page. When provided, the page holding the question
    # is located first and the question is searched afterwards (the question is
    # most likely on the current page; a truncated question may be on the
    # previous one). Declared Optional so that an omitted field and an explicit
    # JSON null are both accepted; the endpoint treats None and "" alike.
    image_ocr: Optional[str] = None
    question_ocr: str  # OCR result of the question
    top_k: int = 1
# Request schema for the question search that uses only the question-image vector.
class SearchQuestionEmbedding(BaseModel):
    book_id: str
    question_embedding: str  # embedding of the question image, sent as a JSON string: json.dumps([...])
    top_k: int = 1
# Request schema for the question search that uses both the question-image
# vector and the OCR result of the question image.
class SearchQuestionTextAndEmbedding(BaseModel):
    book_id: str
    question_ocr: str        # OCR result of the question
    question_embedding: str  # embedding of the question image, sent as a JSON string: json.dumps([...])
    top_k: int = 1
# Request schema for deleting the data of one book.
class DeleteBook(BaseModel):
    book_id: str
# Check whether the file extension is allowed
def allowed_file(filename: str) -> bool:
    """Return True when *filename* has an extension listed in ALLOWED_EXTENSIONS.

    The extension is the text after the last dot, compared case-insensitively.
    A name without any dot is rejected.
    """
    _, dot, extension = filename.rpartition(".")
    # rpartition yields an empty separator when the name contains no dot.
    return bool(dot) and extension.lower() in ALLOWED_EXTENSIONS
def setup_logger():
    """Return the 'search_question' logger, configuring it on the first call.

    On first use the logger is set to INFO, detached from its parent loggers
    and given a single UTF-8 file handler writing to ``log/search_question.log``
    next to this script. Later calls return the already configured logger
    unchanged.
    """
    # Loggers are unique per name, so this always yields the same object.
    logger = logging.getLogger('search_question')
    if not logger.handlers:
        logger.setLevel(logging.INFO)
        # Do not forward records to parent loggers.
        logger.propagate = False

        # Log directory lives beside this script.
        script_dir = os.path.dirname(os.path.abspath(__file__))
        log_dir = os.path.join(script_dir, "log")
        if not os.path.exists(log_dir):
            # exist_ok guards against a concurrent creation of the directory.
            os.makedirs(log_dir, exist_ok=True)

        # delay=True postpones opening the file until the first record is emitted.
        file_handler = logging.FileHandler(
            os.path.join(log_dir, "search_question.log"),
            encoding='utf-8',
            delay=True,
        )
        file_handler.setLevel(logging.INFO)
        file_handler.setFormatter(
            logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        )
        logger.addHandler(file_handler)
    return logger


# Module-wide logger instance
logger_es = setup_logger()
# 1. File upload API (receives the Excel file handed over by the book-splitting step)
@app.post("/upload/excel", response_model=UploadResponse)
async def upload_excel(
    book_id: str = Form(...),  # required form field
    book_name: str = Form(...),  # required form field
    file: UploadFile = File(...)
):
    """Store an uploaded Excel file, convert it to JSON and insert its rows into ES.

    Args:
        book_id: Id of the book the Excel file belongs to.
        book_name: Name of the book.
        file: The uploaded file; its extension must be in ALLOWED_EXTENSIONS.

    Returns:
        A dict matching UploadResponse.

    Raises:
        HTTPException: 400 for a disallowed file type, 500 when saving the
            upload fails, 520 when reading/cleaning the Excel data fails and
            530 when the ES insert fails.
    """
    # Reject disallowed file types
    if not allowed_file(file.filename):
        logger_es.info(f"/upload/excel : 文件类型不允许。允许的类型: {', '.join(ALLOWED_EXTENSIONS)}")
        raise HTTPException(
            status_code=400,
            detail=f"文件类型不允许。允许的类型: {', '.join(ALLOWED_EXTENSIONS)}"
        )
    # 1. Build a filesystem-safe file name. book_id and book_name are
    #    client-supplied form values, so path separators and NUL bytes are
    #    replaced; otherwise a value such as "../x" would make the files below
    #    land outside UPLOAD_DIR. Names without those characters are unchanged.
    file_ext = os.path.splitext(file.filename)[1]
    safe_stem = re.sub(r'[\\/\x00]', "_", f"{book_id}_{book_name}")
    safe_filename = f"{safe_stem}{file_ext}"
    file_path = UPLOAD_DIR / safe_filename
    # 2. Save the Excel file. The upload is read before the target is opened so
    #    that a failed read does not leave an empty file behind.
    try:
        contents = await file.read()
        with open(file_path, "wb") as f:
            f.write(contents)
    except Exception as e:
        logger_es.info(f"/upload/excel : 保存excel文件时出错: {str(e)}")
        raise HTTPException(status_code=500, detail=f"保存excel文件时出错: {str(e)};book_name:{book_name}")
    finally:
        await file.close()
    # 3. Write the cleaned Excel data to a JSON file
    # NOTE(review): read_excel / write2json / insert_doc_batch are called
    # synchronously inside this coroutine — presumably they block the event
    # loop for large files; confirm and consider off-loading to a thread.
    try:
        sheet_name = "题库数据"
        data = pre_data.read_excel(file_path, sheet_name, book_id, book_name)
        safe_filename_json = f"{safe_stem}.json"
        file_path_json = UPLOAD_DIR / safe_filename_json
        pre_data.write2json(file_path_json, data)  # persist the Excel content as JSON
    except Exception as e:
        logger_es.info(f"/upload/excel : 清洗excel数据出错: {str(e)}")
        raise HTTPException(status_code=520, detail=f"清洗excel数据出错: {str(e)};book_name:{book_name}")
    # 4. Insert the book data into ES
    try:
        es.insert_doc_batch(index_name, data)
    except Exception as e:
        logger_es.info(f"/upload/excel : 数据插入es出错: {str(e)}")
        raise HTTPException(status_code=530, detail=f"数据插入es出错: {str(e)}")
    logger_es.info(f"/upload/excel : 数据插入es success: book_id:{str(book_id)};book_name:{book_name};file: {safe_filename}")
    return {
        "status": "success",
        "message": "文件插入 ES 成功",
        "filename": safe_filename,
        "file_path": str(file_path),
        "book_id": book_id,
        "book_name": book_name
    }
# 2. Delete the ES data that belongs to a book_id
@app.post("/delete_book")
async def delete_book(input: DeleteBook):
    """Delete the ES data of ``input.book_id`` from the index.

    Returns a success dict; any exception raised by the ES helper is logged
    and converted into an HTTP 500 response.
    """
    target_id = input.book_id
    logger_es.info(f"delete_book : {input}")
    try:
        # NOTE(review): unlike the es.search_* calls in this file, this call is
        # not awaited — confirm that delete_book_id is a synchronous method.
        es.delete_book_id(index_name, target_id)
    except Exception as err:
        failure = f"删除 book_id:{target_id} 数据出错:{err}。"
        logger_es.info(f"delete_book : {failure}")
        raise HTTPException(status_code=500, detail=failure)
    return {"status": "success", "msg": f"成功删除 book_id:{target_id} 图书数据。"}
# 3. Search for a whole page of a book
@app.post("/search_page")
async def search_page(input: SearchPage):
    """Search the index for the page of ``book_id`` matching the page OCR text.

    Returns ``{"status": "success", "es_search": <helper result>}``; any
    exception raised by the ES helper is logged and turned into an HTTP 500.
    """
    logger_es.info(f"search_page : {input}")
    book_id, image_ocr, top_k = input.book_id, input.image_ocr, input.top_k
    try:
        hits = await es.search_page(index_name, book_id, image_ocr, top_k)
    except Exception as err:
        # Shared tail of the log line and of the HTTP error detail.
        context = f"book_id:{book_id},image_ocr:{image_ocr},出错:{err}。"
        logger_es.info(f"search_page : 搜索 {context}")
        raise HTTPException(status_code=500, detail=f"search_page 接口 {context}")
    return {"status": "success", "es_search": hits}
# Search for one question of a book (using only the OCR result of the question image)
@app.post("/search_question_text")
async def search_question_text(input: SearchQuestionText):
    """Search a question by its OCR text, optionally narrowed by the page OCR text.

    When ``image_ocr`` is empty or missing, only the question text is searched
    (failures answer HTTP 500). Otherwise the page is used together with the
    question text via ``es.search_page_question`` (failures answer HTTP 502).
    """
    book_id, image_ocr = input.book_id, input.image_ocr
    question_ocr, top_k = input.question_ocr, input.top_k

    if not image_ocr:
        # No page OCR supplied: search by question text alone.
        try:
            logger_es.info(f"search_question : image_ocr 为空 : {input}")
            hits = await es.search_question(index_name, book_id, question_ocr, top_k)
        except Exception as err:
            logger_es.info(f"search_question : 搜索 book_id:{book_id},image_ocr 为空:{image_ocr}; question_ocr:{question_ocr},出错:{err}。")
            raise HTTPException(status_code=500, detail=f"search_question 接口 book_id:{book_id},image_ocr:{image_ocr},question_ocr:{question_ocr},出错:{err}。")
        return {"status": "success", "es_search": hits}

    # Both the page OCR and the question OCR were supplied: locate the page
    # first, then search for the question.
    try:
        logger_es.info(f"search_question :image_ocr 非空: {input}")
        hits = await es.search_page_question(index_name, book_id, image_ocr, question_ocr, top_k)
    except Exception as err:
        logger_es.info(f"search_question : 搜索 book_id:{book_id},image_ocr 非空:{image_ocr}; question_ocr:{question_ocr},出错:{err}。")
        raise HTTPException(status_code=502, detail=f"search_question 接口 book_id:{book_id},image_ocr:{image_ocr},question_ocr:{question_ocr},出错:{err}。")
    return {"status": "success", "es_search": hits}
# 4. Search for one question (using only the vector of the question image + OCR)
@app.post("/search_question_embedding")
async def search_question_embedding(input: SearchQuestionEmbedding):
    """Search a question of ``book_id`` by its embedding vector.

    ``input.question_embedding`` must be a JSON-encoded list of ``embed_dim``
    numbers.

    Raises:
        HTTPException: 400 when the embedding is not valid JSON or is not a
            list of ``embed_dim`` elements; 500 when the ES search fails.
    """
    logger_es.info(f"search_question_embedding : book_id: {input.book_id}")
    book_id = input.book_id
    top_k = input.top_k
    # Validate the client payload before the generic try/except below, so a
    # malformed embedding is reported as a client error (400) instead of being
    # swallowed into a 500.
    try:
        question_embedding = json.loads(input.question_embedding)
    except ValueError as e:
        logger_es.info(f"search_question_embedding : book_id:{book_id},question_embedding 不是合法的JSON:{e}。")
        raise HTTPException(status_code=400, detail=f"search_question_embedding 接口 book_id:{book_id},question_embedding 不是合法的JSON:{e}。")
    if not isinstance(question_embedding, list) or len(question_embedding) != embed_dim:
        logger_es.info(f"search_question_embedding : book_id:{book_id},question_embedding 必须是长度为 {embed_dim} 的数组。")
        raise HTTPException(status_code=400, detail=f"search_question_embedding 接口 book_id:{book_id},question_embedding 必须是长度为 {embed_dim} 的数组。")
    try:
        res = await es.search_question_embedding(index_name, book_id, question_embedding, top_k)
    except Exception as e:
        logger_es.info(f"search_question_embedding : 搜索 book_id:{book_id},出错:{e}。")
        raise HTTPException(status_code=500, detail=f"search_question_embedding 接口 book_id:{book_id},出错:{e}。")
    return {"status": "success", "es_search": res}
# 5. Search for one question (text search combined with vector search)
@app.post("/search_question_text_and_embedding")
async def search_question_text_and_embedding(input: SearchQuestionTextAndEmbedding):
    """Search a question of ``book_id`` using both its OCR text and its embedding.

    ``input.question_embedding`` must be a JSON-encoded list of ``embed_dim``
    numbers.

    Raises:
        HTTPException: 400 when the embedding is not valid JSON or is not a
            list of ``embed_dim`` elements; 500 when the ES search fails.
    """
    logger_es.info(f"search_question_text_and_embedding : {input.book_id}")
    book_id = input.book_id
    question_ocr = input.question_ocr
    top_k = input.top_k
    # Validate the client payload before the generic try/except below, so a
    # malformed embedding is reported as a client error (400) instead of being
    # swallowed into a 500.
    try:
        question_embedding = json.loads(input.question_embedding)
    except ValueError as e:
        logger_es.info(f"search_question_text_and_embedding : book_id:{book_id},question_embedding 不是合法的JSON:{e}。")
        raise HTTPException(status_code=400, detail=f"search_question_text_and_embedding 接口 book_id:{book_id},question_embedding 不是合法的JSON:{e}。")
    if not isinstance(question_embedding, list) or len(question_embedding) != embed_dim:
        logger_es.info(f"search_question_text_and_embedding : book_id:{book_id},question_embedding 必须是长度为 {embed_dim} 的数组。")
        raise HTTPException(status_code=400, detail=f"search_question_text_and_embedding 接口 book_id:{book_id},question_embedding 必须是长度为 {embed_dim} 的数组。")
    try:
        res = await es.search_question_textAndEmbedding(index_name, book_id, question_ocr, question_embedding, top_k)
    except Exception as e:
        logger_es.info(f"search_question_text_and_embedding : 搜索 book_id:{book_id},出错:{e}。")
        raise HTTPException(status_code=500, detail=f"search_question_text_and_embedding 接口 book_id:{book_id},出错:{e}。")
    return {"status": "success", "es_search": res}
if __name__ == "__main__":
    # The deployed service listens on port 31001.
    uvicorn.run(app="api_service:app", host="0.0.0.0", port=31001, workers=1)
# How the service is started on the 61.170.32.8 server:
#   cd /home/liuxin/work/search_question
#   conda activate search_book
#   nohup python -u api_service.py > api_service.log 2>&1 &
#   tail -f log/search_question.log
import requests
import re,json,os,sys
def get_page_image_url(bankId="760261", timeout=30):
    """Return the URL of the whole-page image that contains a question.

    Args:
        bankId: bankId of the question.
        timeout: Seconds to wait for the remote service before giving up, so a
            stalled connection cannot block the caller forever.

    Returns:
        The ``data.sourceImageUrl`` value of the service response.

    Raises:
        requests.RequestException: On timeouts, connection problems or a
            non-2xx HTTP status.
    """
    url = "https://rays7.5rs.me/matrix/v1.0/aIRecognized/getPageInfoByBankId"
    headers = {
        # NOTE(review): credential is hard-coded in source — consider loading
        # it from the environment or a config file.
        "token": "whlg2025!",
        "Only-For-Backend": "y"
    }
    response = requests.get(url, headers=headers, params={"bankId": bankId}, timeout=timeout)
    # Fail with a clear HTTP error instead of a KeyError on an error payload.
    response.raise_for_status()
    return response.json()['data']['sourceImageUrl']
# get_page_image_url()
def insert_excel(data):
    """Upload an Excel file to the /upload/excel endpoint and print the reply.

    Args:
        data: Dict sent as the form fields of the request. Its 'file_path'
            entry names the local Excel file to upload; the endpoint also
            requires 'book_id' and 'book_name'.
    """
    endpoint = 'http://localhost:31001/upload/excel'
    # Remote deployment: 'http://61.170.32.8:31001/upload/excel'
    excel_path = data['file_path']
    # The file must stay open while the multipart request is being sent.
    with open(excel_path, 'rb') as fh:
        response = requests.post(
            endpoint,
            headers={'accept': 'application/json'},
            data=data,
            files={'file': (excel_path, fh)},
        )
    # Show the outcome
    print(f"状态码: {response.status_code}")
    print(f"响应内容: {response.json()}")
def delete_es_book(book_id="12663121"):
    """Ask the service to delete the ES data of one book and print the reply.

    Args:
        book_id: Id of the book whose data should be deleted.
    """
    url = 'http://localhost:31001/delete_book'
    # url = 'http://61.170.32.8:31001/delete_book'
    headers = {
        'accept': 'application/json',
    }
    # `json=` lets requests serialise the payload and set
    # Content-Type: application/json, which the JSON-body endpoint expects.
    response = requests.post(url, headers=headers, json={"book_id": book_id})
    # Show the outcome
    print(f"状态码: {response.status_code}")
    res = json.dumps(response.json(), ensure_ascii=False, indent=4)
    print(res)
def search_page(book_id, image_ocr):
    """Call the /search_page endpoint and return the reply as pretty-printed JSON.

    Args:
        book_id: Id of the book to search in.
        image_ocr: OCR text of the whole page.

    Returns:
        The JSON reply of the service, formatted as an indented string. The
        bankId of the first hit is printed when the reply contains one.
    """
    url = 'http://localhost:31001/search_page'
    # url = 'http://61.170.32.8:31001/search_page'
    headers = {
        'accept': 'application/json',
    }
    payload = {
        "book_id": book_id,
        "image_ocr": image_ocr,
        "top_k": 1
    }
    # `json=` encodes the body as UTF-8 JSON and sets the Content-Type header.
    # Posting a non-ASCII str via `data=` can fail with UnicodeEncodeError,
    # because some HTTP stacks encode str bodies as latin-1.
    response = requests.post(url, headers=headers, json=payload)
    # Show the outcome
    print(f"状态码: {response.status_code}")
    res = response.json()
    # Error replies carry no 'es_search' key and a search may return no hits;
    # report that instead of crashing with KeyError / IndexError.
    hits = res.get('es_search') if isinstance(res, dict) else None
    bankId = hits[0]['_source']['bankId'] if hits else None
    print("bankId: ", bankId)
    return json.dumps(res, ensure_ascii=False, indent=4)
def search_question_text(book_id, question_ocr, image_ocr=""):
    """Call the /search_question_text endpoint and return the reply as pretty-printed JSON.

    Args:
        book_id: Id of the book to search in.
        question_ocr: OCR text of the question.
        image_ocr: Optional OCR text of the whole page.

    Returns:
        The JSON reply of the service, formatted as an indented string. The
        bankId of the first hit is printed when the reply contains one.
    """
    url = 'http://localhost:31001/search_question_text'
    # url = 'http://61.170.32.8:31001/search_question_text'
    headers = {
        'accept': 'application/json',
    }
    payload = {
        "book_id": book_id,
        "image_ocr": image_ocr,
        "question_ocr": question_ocr,
        "top_k": 1
    }
    # `json=` encodes the body as UTF-8 JSON and sets the Content-Type header.
    # Posting a non-ASCII str via `data=` can fail with UnicodeEncodeError,
    # because some HTTP stacks encode str bodies as latin-1.
    response = requests.post(url, headers=headers, json=payload)
    # Show the outcome
    print(f"状态码: {response.status_code}")
    res = response.json()
    # Error replies carry no 'es_search' key and a search may return no hits;
    # report that instead of crashing with KeyError / IndexError.
    hits = res.get('es_search') if isinstance(res, dict) else None
    bankId = hits[0]['_source']['bankId'] if hits else None
    print("bankId: ", bankId)
    return json.dumps(res, ensure_ascii=False, indent=4)
def search_question_embedding(book_id, question_embedding):
    """Call the /search_question_embedding endpoint and return the reply as pretty-printed JSON.

    Args:
        book_id: Id of the book to search in.
        question_embedding: Embedding of the question, either already encoded
            as a JSON string or as a plain list of numbers.

    Returns:
        The JSON reply of the service, formatted as an indented string. The
        bankId of the first hit is printed when the reply contains one.
    """
    url = 'http://localhost:31001/search_question_embedding'
    # url = 'http://61.170.32.8:31001/search_question_embedding'
    headers = {
        'accept': 'application/json',
    }
    # The endpoint declares question_embedding as a JSON *string*, so a raw
    # list is serialised here rather than being rejected by the server.
    if not isinstance(question_embedding, str):
        question_embedding = json.dumps(question_embedding)
    payload = {
        "book_id": book_id,
        "question_embedding": question_embedding,
        "top_k": 1
    }
    # `json=` encodes the body as UTF-8 JSON and sets the Content-Type header.
    response = requests.post(url, headers=headers, json=payload)
    # Show the outcome
    print(f"状态码: {response.status_code}")
    res = response.json()
    # Error replies carry no 'es_search' key and a search may return no hits;
    # report that instead of crashing with KeyError / IndexError.
    hits = res.get('es_search') if isinstance(res, dict) else None
    bankId = hits[0]['_source']['bankId'] if hits else None
    print("bankId: ", bankId)
    return json.dumps(res, ensure_ascii=False, indent=4)
def search_question_text_and_embedding(book_id, question_ocr, question_embedding):
    """Call the /search_question_text_and_embedding endpoint and return the reply as pretty-printed JSON.

    The endpoint combines a text search on the question OCR with a vector
    search on the embedding of the question.

    Args:
        book_id: Id of the book to search in.
        question_ocr: OCR text of the question.
        question_embedding: Embedding of the question, either already encoded
            as a JSON string or as a plain list of numbers.

    Returns:
        The JSON reply of the service, formatted as an indented string. The
        bankId of the first hit is printed when the reply contains one.
    """
    url = 'http://localhost:31001/search_question_text_and_embedding'
    # url = 'http://61.170.32.8:31001/search_question_text_and_embedding'
    headers = {
        'accept': 'application/json',
    }
    # The endpoint declares question_embedding as a JSON *string*, so a raw
    # list is serialised here rather than being rejected by the server.
    if not isinstance(question_embedding, str):
        question_embedding = json.dumps(question_embedding)
    payload = {
        "book_id": book_id,
        "question_ocr": question_ocr,
        "question_embedding": question_embedding,
        "top_k": 1
    }
    # `json=` encodes the body as UTF-8 JSON and sets the Content-Type header.
    # Posting a non-ASCII str via `data=` can fail with UnicodeEncodeError,
    # because some HTTP stacks encode str bodies as latin-1.
    response = requests.post(url, headers=headers, json=payload)
    # Show the outcome
    print(f"状态码: {response.status_code}")
    res = response.json()
    # Error replies carry no 'es_search' key and a search may return no hits;
    # report that instead of crashing with KeyError / IndexError.
    hits = res.get('es_search') if isinstance(res, dict) else None
    bankId = hits[0]['_source']['bankId'] if hits else None
    print("bankId: ", bankId)
    return json.dumps(res, ensure_ascii=False, indent=4)
if __name__ == "__main__":
# 1、es中插入excel数据
# data = {'book_id': '12663121', 'book_name': '衔接教材地理', "file_path": '/home/liuxin/work/search_question/data/12663121-衔接教材地理-原图/12663121-衔接教材地理.xlsx' } # 64
# data = {'book_id': '12667382', 'book_name': '25春北师一数暑假作业', "file_path": '/home/liuxin/work/search_question/data/12667382-25春北师一数暑假作业-原图/12667382-25春北师一数暑假作业.xlsx'} # 44
data = {'book_id': '12664728', 'book_name': '课计划·七年级英语·RJ·上册(出血文件)', "file_path": '/home/liuxin/work/search_question/data/12664728-课计划·七年级英语·RJ·上册(出血文件)-原图/12664728-课计划·七年级英语·RJ·上册(出血文件).xlsx'} # 96
data = {'book_id': '12670279', 'book_name': '2025版ZT7地-下册横版《锦上添花(期末大赢家)》下册-新编文件5.8', "file_path": '/home/liuxin/work/search_question/data/12670279-12670279-2025版ZT7地-下册横版《锦上添花(期末大赢家)》下册-新编文件5.8-原图/12670279-12670279-2025版ZT7地-下册横版《锦上添花(期末大赢家)》下册-新编文件5.8.xlsx'} # 26
data = {'book_id': '12677472', 'book_name': '衔接教材地理', "file_path": '/home/liuxin/work/search_question/data/12677472-语文1-160-原图/12677472-语文1-160.xlsx'}
data = {'book_id': '12677471', 'book_name': '卷1-132', "file_path": '/home/liuxin/work/search_question/data/12677471-卷1-132-原图/12677471-卷1-132.xlsx'}
data = {'book_id': '12677035', 'book_name': '2025《创新教程》新高考试题精选生物学', "file_path": '/home/liuxin/work/search_question/data/12677035-2025《创新教程》新高考试题精选生物学-原图/12677035-2025《创新教程》新高考试题精选生物学.xlsx'}
data = {'book_id': '12668851', 'book_name': '高二暑假作业化学', "file_path": '/home/liuxin/work/search_question/data/12668851-高二暑假作业化学-原图/12668851-高二暑假作业化学.xlsx'}
data = {"book_id": "12671977", "book_name":"暑假生活·学期总复习八年级物理通用版6", "file_path": "/home/liuxin/work/search_question/data/12671977-暑假生活·学期总复习八年级物理通用版6-原图/12671977-暑假生活·学期总复习八年级物理通用版6.xlsx"}
# print("\n1、es中插入excel数据")
# insert_excel(data)
# 1.2、插入整个文件夹的数据进入es
path_dir = "/home/liuxin/work/search_question/data/拆书平台下载excel文件_添加字段内容"
file_names = os.listdir(path_dir)
file_names = [file_name for file_name in file_names if file_name.endswith(".xlsx")]
print(f"文件夹 {path_dir} 共{len(file_names)}个excel文件")
file_names = [file_name for file_name in file_names if "-" in file_name]
print(f"文件夹 {path_dir} 共{len(file_names)}个excel文件")
book_ids = [file_name.split("-")[0] for file_name in file_names ]
for book_id, file_name in zip(book_ids, file_names):
file = os.path.join(path_dir, file_name)
input = {"book_id":book_id, "book_name":file_name[:-5], "file_path":file}
insert_excel(input)
# # 2、指定 book_id 删除es中对应的数据
# print("\n2、指定 book_id 删除es中对应的数据")
book_id = "12664728"
book_id = "12670279"
book_id = "12667382"
book_id = "12663121"
# delete_es_book(book_id)
# # # # # 3、搜书图书的整页
# print("\n3、搜书图书的整页")
# book_id = "12671977"
# image_ocr = "物理"
# res = search_page(book_id, image_ocr)
# print(res)
# # # 4、搜题 只搜索文本数据
print("\n4、搜题 只搜索文本数据")
book_id = "12667382"
question_ocr = "物理10"
image_ocr = ""
# res = search_question_text(book_id, question_ocr, image_ocr=image_ocr)
# print(res)
# 5、搜题 只搜索题目向量
print("\n5、搜题 只搜索题目向量")
book_id = "12667382"
question_embedding = [
0.00543212890625,
-0.005035400390625,
0.00714111328125,
0.06591796875,
-0.032958984375,
-0.0380859375,
-0.0255126953125,
-0.046630859375,
0.01031494140625,
0.036376953125,
0.013916015625,
0.0255126953125,
-0.0107421875,
-0.04345703125,
0.0008697509765625,
0.02587890625,
-0.0201416015625,
-0.02099609375,
-0.00830078125,
0.033203125,
-0.01007080078125,
0.0224609375,
-0.04931640625,
0.0206298828125,
0.009765625,
0.02392578125,
0.00095367431640625,
0.03369140625,
0.005035400390625,
0.07373046875,
0.0262451171875,
0.015625,
-0.021728515625,
0.0054931640625,
0.0308837890625,
0.055419921875,
-0.0517578125,
0.011474609375,
-0.023681640625,
0.0235595703125,
-0.00128173828125,
0.0096435546875,
0.003936767578125,
-0.01953125,
-0.05126953125,
0.001617431640625,
-0.005218505859375,
-0.01409912109375,
-0.026611328125,
0.0198974609375,
0.02001953125,
0.004974365234375,
0.053955078125,
-0.0634765625,
-0.0654296875,
-0.03369140625,
-0.08154296875,
0.06005859375,
0.0223388671875,
-0.044921875,
-0.0283203125,
0.01483154296875,
0.150390625,
0.026611328125,
0.02392578125,
-0.00125885009765625,
-0.053955078125,
-0.03759765625,
-0.0390625,
0.02734375,
-0.01385498046875,
0.00140380859375,
0.0218505859375,
-0.01025390625,
-0.0205078125,
0.04638671875,
0.0145263671875,
-0.00927734375,
0.01025390625,
-0.0225830078125,
-0.00909423828125,
-0.0142822265625,
0.02978515625,
0.0101318359375,
-0.072265625,
-0.049560546875,
0.0311279296875,
0.0101318359375,
-0.00445556640625,
-0.03369140625,
0.0615234375,
-0.021240234375,
-0.034912109375,
-0.004150390625,
0.07177734375,
0.0262451171875,
-0.013671875,
0.03369140625,
0.021728515625,
0.0252685546875,
-0.00982666015625,
-0.0028228759765625,
-0.00151824951171875,
0.003753662109375,
0.004791259765625,
-0.03759765625,
0.025146484375,
0.0235595703125,
-0.0103759765625,
0.0084228515625,
-0.0191650390625,
0.044921875,
0.052001953125,
-0.053955078125,
0.0206298828125,
0.036376953125,
0.01202392578125,
0.0196533203125,
0.025146484375,
-0.000457763671875,
-0.0033721923828125,
-0.0101318359375,
0.005035400390625,
0.0089111328125,
0.002227783203125,
-0.041015625,
0.00927734375,
0.10693359375,
0.01043701171875,
0.00616455078125,
0.013916015625,
0.0035858154296875,
-0.0283203125,
-0.06591796875,
0.016845703125,
-0.0615234375,
-0.01385498046875,
-0.05029296875,
-0.01556396484375,
-0.0128173828125,
-0.0140380859375,
0.00732421875,
-0.0164794921875,
0.01239013671875,
0.0150146484375,
-0.0169677734375,
-0.142578125,
-0.0008087158203125,
-0.007171630859375,
0.0033416748046875,
0.03662109375,
-0.00341796875,
-0.0198974609375,
-0.03955078125,
-0.06640625,
-0.05712890625,
0.0133056640625,
-0.0277099609375,
-0.005462646484375,
-0.0224609375,
0.002166748046875,
0.0296630859375,
-0.030517578125,
-0.029052734375,
0.030517578125,
-0.0162353515625,
-0.0252685546875,
-0.0174560546875,
-0.023193359375,
0.005340576171875,
-0.026611328125,
-0.006011962890625,
-0.01611328125,
-0.033203125,
-0.000820159912109375,
0.04248046875,
-0.0301513671875,
0.019287109375,
0.00909423828125,
0.0400390625,
0.03564453125,
0.030517578125,
0.0140380859375,
-0.0014495849609375,
0.01251220703125,
-0.0556640625,
0.04248046875,
-0.056640625,
-0.030517578125,
-0.009033203125,
-0.01165771484375,
-0.0277099609375,
0.0087890625,
0.024658203125,
-0.043701171875,
0.0146484375,
-0.020751953125,
0.01007080078125,
0.0164794921875,
0.06591796875,
-0.06298828125,
0.0303955078125,
-0.020263671875,
-0.007171630859375,
0.004425048828125,
0.0020751953125,
-0.01373291015625,
0.050537109375,
-0.08203125,
-0.036865234375,
0.019775390625,
-0.0050048828125,
0.06640625,
-0.00823974609375,
0.04248046875,
-0.02734375,
0.004425048828125,
0.006500244140625,
-0.00537109375,
-0.0546875,
0.01470947265625,
0.0089111328125,
0.0025787353515625,
-0.00860595703125,
0.018310546875,
-0.0208740234375,
-0.06689453125,
-0.00592041015625,
-0.0303955078125,
-0.0021514892578125,
0.00372314453125,
0.0218505859375,
-0.0301513671875,
0.02587890625,
-0.01239013671875,
0.04541015625,
0.03662109375,
0.07421875,
0.0225830078125,
0.000385284423828125,
0.06396484375,
-0.0020294189453125,
0.008056640625,
0.0111083984375,
0.0081787109375,
0.033447265625,
-0.01544189453125,
0.09912109375,
-0.00994873046875,
0.0208740234375,
-0.044921875,
-0.017333984375,
0.0211181640625,
0.08056640625,
-0.02294921875,
0.062255859375,
0.01708984375,
-0.011474609375,
0.03125,
0.018798828125,
0.0019683837890625,
0.0390625,
-0.0052490234375,
-0.01312255859375,
-0.002044677734375,
-0.046630859375,
0.015625,
0.037353515625,
0.002777099609375,
-0.00811767578125,
0.0233154296875,
-0.007232666015625,
0.039794921875,
0.009765625,
0.01708984375,
0.043212890625,
0.0101318359375,
0.02783203125,
-0.021484375,
-0.01470947265625,
-0.013671875,
0.01470947265625,
-0.032470703125,
-0.0185546875,
-0.00506591796875,
0.03125,
-0.03125,
0.048095703125,
-0.01165771484375,
0.0296630859375,
0.017822265625,
-0.011474609375,
0.00787353515625,
0.005645751953125,
-0.0093994140625,
0.0380859375,
-0.017333984375,
0.0255126953125,
-0.01806640625,
0.033447265625,
-0.001739501953125,
-0.0191650390625,
0.0054931640625,
0.053466796875,
-0.0308837890625,
-0.006072998046875,
0.0179443359375,
0.025634765625,
-0.01318359375,
0.01214599609375,
-0.0040283203125,
0.03125,
0.047119140625,
0.056640625,
0.01116943359375,
-0.0235595703125,
0.00872802734375,
0.003936767578125,
-0.000652313232421875,
-0.004638671875,
-0.00341796875,
0.0035400390625,
0.0322265625,
0.0169677734375,
0.008056640625,
-0.0262451171875,
-0.080078125,
-0.04638671875,
0.017822265625,
-0.01544189453125,
-0.0296630859375,
0.033203125,
0.10693359375,
0.038330078125,
0.00286865234375,
0.04541015625,
0.021484375,
-0.04931640625,
0.01953125,
-0.035400390625,
-0.07958984375,
-0.07177734375,
-0.004119873046875,
0.0208740234375,
0.10498046875,
-0.006011962890625,
-0.04248046875,
-0.045654296875,
-0.001434326171875,
0.00994873046875,
0.0164794921875,
0.0277099609375,
0.006378173828125,
-0.0286865234375,
0.0191650390625,
-0.02587890625,
0.011474609375,
0.0703125,
0.00732421875,
-0.032470703125,
-0.0211181640625,
-0.05615234375,
0.01092529296875,
0.031494140625,
0.0247802734375,
-0.00372314453125,
-0.0013427734375,
-0.06982421875,
-0.00714111328125,
0.0196533203125,
0.017333984375,
0.01336669921875,
-0.00188446044921875,
-0.01019287109375,
-0.00836181640625,
0.035400390625,
-0.0400390625,
0.01177978515625,
0.009033203125,
0.004425048828125,
0.0157470703125,
0.0015716552734375,
0.007354736328125,
-0.037841796875,
0.01214599609375,
-0.002685546875,
-0.0233154296875,
0.0107421875,
-0.052001953125,
-0.057373046875,
0.007354736328125,
0.007171630859375,
-0.02734375,
-0.005523681640625,
-0.0010528564453125,
-0.0106201171875,
0.0022125244140625,
0.08642578125,
-0.033447265625,
0.03125,
-0.0225830078125,
-0.03125,
0.007049560546875,
-0.0712890625,
-0.0023956298828125,
0.006317138671875,
-0.056640625,
0.0206298828125,
-0.017333984375,
0.048828125,
0.0419921875,
-0.0037078857421875,
0.02978515625,
0.0159912109375,
0.04443359375,
-0.060302734375,
-0.038330078125,
0.0277099609375,
0.036865234375,
0.0322265625,
-0.006011962890625,
-0.022705078125,
-0.050537109375,
0.041748046875,
0.01507568359375,
0.007232666015625,
0.050048828125,
0.041015625,
-0.0240478515625,
0.006561279296875,
0.000301361083984375,
-0.01446533203125,
0.01507568359375,
-0.0556640625,
-0.02734375,
-0.0081787109375,
0.04736328125,
0.0025787353515625,
-0.0283203125,
0.01129150390625,
-0.0252685546875,
0.0225830078125,
0.00421142578125,
-0.004791259765625,
0.005401611328125,
0.006011962890625,
-0.057861328125,
-0.0289306640625,
0.008544921875,
-0.0115966796875,
0.022705078125,
-0.008056640625,
-0.01416015625,
0.0021209716796875,
-0.0038909912109375,
0.01025390625,
-0.0247802734375,
0.01275634765625,
0.031494140625,
-0.00015926361083984375,
-0.004119873046875,
0.041259765625,
-0.040283203125,
0.052734375,
-0.052734375,
-0.0225830078125,
-0.06884765625,
-0.07421875,
-0.0341796875,
0.0196533203125,
0.0038604736328125,
-0.041015625,
-0.005615234375,
-0.039794921875,
-0.0205078125,
0.04931640625,
-0.0289306640625,
0.006561279296875,
0.020751953125,
-0.03369140625,
0.020263671875,
-0.039794921875,
0.0081787109375,
-0.0020294189453125,
0.0218505859375,
0.01611328125,
-0.0281982421875,
-0.027099609375,
0.00131988525390625,
-0.0654296875,
0.023681640625,
-0.006256103515625,
-0.0211181640625,
-0.006103515625,
-0.034912109375,
-0.00299072265625,
-0.0059814453125,
-0.036376953125,
-0.0174560546875,
-0.056640625,
0.000316619873046875,
-0.033935546875,
0.00726318359375,
-0.033447265625,
0.06591796875,
0.0018768310546875,
0.02734375,
-0.04345703125,
0.032958984375,
-0.01953125,
-0.0034332275390625,
0.01300048828125,
0.000713348388671875,
-0.008056640625,
-0.004730224609375,
-0.0341796875,
0.030517578125,
-0.0255126953125,
0.0286865234375,
-0.04638671875,
0.025146484375,
-0.03515625,
0.001220703125,
-0.01068115234375,
0.043212890625,
-0.01416015625,
-0.0223388671875,
-0.03369140625,
-0.0059814453125,
0.007049560546875,
0.0220947265625,
-0.0281982421875,
-0.00811767578125,
-0.00799560546875,
0.0263671875,
0.01373291015625,
0.00194549560546875,
0.20703125,
-0.0076904296875,
0.06591796875,
-0.0059814453125,
0.01043701171875,
-0.01251220703125,
0.038330078125,
-0.0087890625,
0.09765625,
0.003082275390625,
0.0096435546875,
0.000152587890625,
0.02392578125,
-0.04638671875,
-0.007781982421875,
-0.0030975341796875,
-0.00098419189453125,
-0.0181884765625,
-0.007232666015625,
-0.0235595703125,
0.033447265625,
0.0125732421875,
-0.00171661376953125,
0.022216796875,
-0.000278472900390625,
-0.02392578125,
0.0294189453125,
-0.0128173828125,
0.039794921875,
-0.0247802734375,
-0.049560546875,
0.003631591796875,
0.0206298828125,
0.0164794921875,
0.028564453125,
-0.01251220703125,
0.02294921875,
-0.00099945068359375,
-0.022216796875,
-0.050537109375,
0.001861572265625,
-0.0191650390625,
-0.045654296875,
-0.023681640625,
0.004119873046875,
0.050048828125,
-0.007232666015625,
0.05029296875,
-0.027099609375,
-0.0169677734375,
0.035888671875,
-0.02001953125,
-0.0040283203125,
0.01043701171875,
0.0035400390625,
-0.0059814453125,
0.0263671875,
-0.01220703125,
-0.016357421875,
0.01483154296875,
-0.0390625,
-0.00396728515625,
0.04345703125,
0.023193359375,
-0.0172119140625,
0.021240234375,
-0.04736328125,
0.006866455078125,
0.036376953125,
-0.0289306640625,
0.0208740234375,
0.011474609375,
-0.0260009765625,
0.01904296875,
0.016357421875,
-0.04931640625,
-0.0478515625,
0.0213623046875,
-0.01275634765625,
-0.032470703125,
-0.0084228515625,
-0.001922607421875,
0.0191650390625,
0.008544921875,
-0.037353515625,
0.0111083984375,
-0.022705078125,
-0.04052734375,
0.033447265625,
-0.01116943359375,
-0.03369140625,
-0.0167236328125,
-0.0303955078125,
0.0191650390625,
-0.06591796875,
0.00433349609375,
-0.078125,
-0.00732421875,
0.04150390625,
0.00286865234375,
0.0157470703125,
-0.00008440017700195312,
0.041015625,
-0.034423828125,
0.053466796875,
0.005218505859375,
0.01239013671875,
-0.006072998046875,
-0.00628662109375,
0.018798828125,
0.00151824951171875,
0.01165771484375,
0.0164794921875,
0.0089111328125,
0.0247802734375,
0.024658203125,
-0.0031585693359375,
-0.0181884765625,
-0.0103759765625,
-0.04541015625,
0.00323486328125,
0.0157470703125,
-0.004791259765625,
0.0308837890625,
-0.03076171875,
0.00506591796875,
0.02392578125,
-0.0179443359375,
0.0269775390625,
-0.003509521484375,
-0.0181884765625,
0.020263671875,
0.03076171875,
0.00145721435546875,
-0.004547119140625,
0.0167236328125,
0.035400390625,
-0.0206298828125,
-0.0810546875,
-0.0113525390625,
-0.02294921875,
0.0022735595703125,
0.0250244140625,
-0.006561279296875,
0.0223388671875,
0.043212890625,
0.03564453125,
-0.009765625,
0.036376953125,
-0.0028228759765625,
-0.003204345703125,
-0.041748046875,
-0.001495361328125,
-0.05419921875,
-0.04248046875,
-0.01708984375,
-0.002777099609375,
-0.01251220703125,
-0.001739501953125,
-0.0084228515625,
0.00130462646484375,
-0.000579833984375,
-0.0262451171875,
0.01446533203125,
0.037841796875,
0.02001953125,
-0.0157470703125,
-0.017333984375,
0.006317138671875,
0.0125732421875,
-0.0206298828125,
0.02197265625,
0.012939453125,
-0.0162353515625,
-0.068359375,
0.0250244140625,
0.006317138671875,
0.00201416015625,
-0.005126953125,
-0.00872802734375,
-0.06591796875,
0.0233154296875,
-0.01806640625,
-0.01312255859375,
-0.042236328125,
-0.03125,
-0.0198974609375,
-0.00193023681640625,
-0.004364013671875,
-0.0262451171875,
-0.0439453125,
-0.020263671875,
-0.01043701171875,
-0.0250244140625,
0.036865234375,
0.00946044921875,
-0.012939453125,
0.0213623046875,
-0.032470703125,
-0.048095703125,
0.020263671875,
0.00482177734375,
0.048583984375,
0.0301513671875,
-0.046630859375,
-0.021728515625,
-0.00408935546875,
0.0164794921875,
-0.006591796875,
-0.01409912109375,
0.0439453125,
-0.003936767578125,
-0.053955078125,
-0.0123291015625,
-0.025146484375,
-0.0185546875,
-0.061767578125,
0.0045166015625,
-0.01348876953125,
-0.025634765625,
0.0013580322265625,
-0.033447265625,
-0.0093994140625,
-0.039794921875,
0.04638671875,
-0.00799560546875,
-0.03955078125,
-0.015869140625,
0.0322265625,
0.0093994140625,
0.02978515625,
0.005828857421875,
-0.00011873245239257812,
-0.003936767578125,
0.0274658203125,
0.00701904296875,
0.000858306884765625,
-0.0146484375,
0.020263671875,
0.04150390625,
0.0216064453125,
-0.0164794921875,
0.043212890625,
0.0546875,
0.050048828125,
0.01470947265625,
-0.006561279296875,
0.00958251953125,
-0.0164794921875,
-0.01177978515625,
-0.03564453125,
-0.009765625,
0.007598876953125,
-0.00133514404296875,
-0.0322265625,
0.0211181640625,
0.029296875,
0.009765625,
-0.00151824951171875,
-0.0019073486328125,
-0.00830078125,
-0.032958984375,
0.028564453125,
-0.0186767578125,
-0.03369140625,
0.00897216796875,
-0.0040283203125,
-0.01104736328125,
0.01434326171875,
0.05419921875,
0.012939453125,
-0.0341796875,
-0.002471923828125,
-0.01446533203125,
0.04150390625,
0.04541015625,
0.0203857421875,
0.01251220703125,
-0.01348876953125,
0.01312255859375,
0.0052490234375,
0.0419921875,
-0.01092529296875,
-0.022705078125,
-0.0252685546875,
0.00836181640625,
-0.03759765625,
-0.03076171875,
0.009033203125,
-0.017822265625,
-0.00982666015625,
0.04736328125,
0.025634765625,
0.01165771484375,
-0.032470703125,
-0.0267333984375,
0.03515625,
0.0012664794921875,
-0.061767578125,
-0.0089111328125,
0.050048828125,
0.01275634765625,
0.0250244140625,
0.0191650390625,
0.036376953125,
0.005828857421875,
-0.0047607421875,
0.0023956298828125,
-0.031494140625,
-0.003143310546875,
-0.004669189453125,
0.0194091796875,
0.00897216796875,
-0.01104736328125,
0.0849609375,
0.007232666015625,
-0.0201416015625,
-0.0260009765625,
-0.0172119140625,
0.08984375,
0.0517578125,
-0.0260009765625,
-0.0260009765625,
-0.01239013671875,
-0.0247802734375,
0.0164794921875,
-0.0225830078125,
0.018798828125,
-0.038330078125,
-0.0308837890625,
-0.024658203125,
-0.002227783203125,
0.003204345703125,
0.0185546875,
-0.04443359375,
-0.004302978515625,
0.03271484375,
-0.01507568359375,
-0.00080108642578125,
0.038818359375,
0.0040283203125,
0.007049560546875,
-0.0198974609375,
0.004364013671875,
-0.02392578125,
-0.029296875,
-0.0037994384765625,
-0.033447265625,
0.01348876953125,
0.058837890625,
0.00811767578125,
0.0341796875,
0.0064697265625,
0.025146484375,
-0.000492095947265625,
0.0289306640625,
-0.0074462890625,
0.0189208984375,
0.031494140625,
-0.00518798828125,
-0.0283203125,
-0.0011444091796875,
0.03857421875,
0.036376953125,
0.026611328125,
0.005615234375,
-0.0032958984375,
-0.025146484375,
-0.02197265625,
0.0194091796875,
0.0181884765625,
0.07275390625,
-0.0045166015625,
0.025634765625,
-0.0286865234375,
0.0103759765625,
-0.01141357421875,
-0.03564453125,
0.049560546875,
-0.03173828125,
-0.0269775390625,
0.02734375,
0.041015625,
-0.01953125,
-0.0003528594970703125,
-0.0133056640625,
0.0203857421875,
-0.011474609375,
0.0150146484375,
0.0001010894775390625,
-0.01165771484375,
-0.026611328125,
0.03564453125,
-0.012939453125,
0.00024127960205078125,
-0.01446533203125,
0.0322265625,
-0.03125,
-0.0093994140625,
0.00066375732421875,
-0.02197265625,
0.0115966796875,
-0.033935546875,
-0.045166015625,
0.0361328125,
0.017822265625,
0.0322265625,
-0.0263671875,
0.04345703125,
-0.0133056640625,
-0.0478515625,
0.0087890625,
0.0208740234375,
-0.014892578125,
0.0135498046875,
0.044921875,
-0.006011962890625,
-0.0042724609375,
-0.0242919921875,
0.025146484375,
-0.021728515625,
-0.033935546875,
-0.0198974609375,
0.0172119140625,
0.00958251953125,
0.01806640625,
0.04345703125,
-0.01104736328125,
0.036376953125,
0.00141143798828125,
-0.03466796875,
-0.0001964569091796875,
-0.035400390625,
0.0191650390625,
0.0031280517578125,
0.00848388671875,
0.03076171875,
-0.0277099609375,
0.01953125,
0.08447265625,
0.0169677734375,
-0.02783203125,
-0.045166015625,
0.0181884765625,
0.039794921875,
-0.023681640625,
-0.00701904296875,
0.040771484375,
-0.020751953125,
0.06689453125,
-0.048828125,
-0.00164031982421875,
0.006256103515625,
0.01007080078125,
0.00323486328125,
-0.03662109375,
0.0419921875,
0.0002651214599609375,
-0.0301513671875,
-0.0111083984375,
0.072265625,
0.10009765625,
0.03564453125,
-0.0032501220703125,
0.06103515625,
0.0103759765625,
0.011474609375,
-0.034912109375,
-0.0115966796875,
0.025634765625,
-0.062255859375,
-0.01416015625,
0.037841796875,
-0.006683349609375,
-0.04736328125,
0.04345703125,
0.0021514892578125,
-0.0311279296875,
0.00885009765625,
-0.0107421875,
0.00372314453125,
-0.021728515625,
-0.00020503997802734375,
0.000263214111328125,
-0.01446533203125,
0.0028228759765625,
0.0277099609375
]
question_embedding = json.dumps(question_embedding)
# res = search_question_embedding(book_id, question_embedding)
# print(res)
# 6. Question search: joint scoring over the question embedding and the question OCR text
print("\n6、搜题 搜索题目向量和题目ocr文本,联合打分")
book_id = "12667382"
question_embedding = [
0.00543212890625,
-0.005035400390625,
0.00714111328125,
0.06591796875,
-0.032958984375,
-0.0380859375,
-0.0255126953125,
-0.046630859375,
0.01031494140625,
0.036376953125,
0.013916015625,
0.0255126953125,
-0.0107421875,
-0.04345703125,
0.0008697509765625,
0.02587890625,
-0.0201416015625,
-0.02099609375,
-0.00830078125,
0.033203125,
-0.01007080078125,
0.0224609375,
-0.04931640625,
0.0206298828125,
0.009765625,
0.02392578125,
0.00095367431640625,
0.03369140625,
0.005035400390625,
0.07373046875,
0.0262451171875,
0.015625,
-0.021728515625,
0.0054931640625,
0.0308837890625,
0.055419921875,
-0.0517578125,
0.011474609375,
-0.023681640625,
0.0235595703125,
-0.00128173828125,
0.0096435546875,
0.003936767578125,
-0.01953125,
-0.05126953125,
0.001617431640625,
-0.005218505859375,
-0.01409912109375,
-0.026611328125,
0.0198974609375,
0.02001953125,
0.004974365234375,
0.053955078125,
-0.0634765625,
-0.0654296875,
-0.03369140625,
-0.08154296875,
0.06005859375,
0.0223388671875,
-0.044921875,
-0.0283203125,
0.01483154296875,
0.150390625,
0.026611328125,
0.02392578125,
-0.00125885009765625,
-0.053955078125,
-0.03759765625,
-0.0390625,
0.02734375,
-0.01385498046875,
0.00140380859375,
0.0218505859375,
-0.01025390625,
-0.0205078125,
0.04638671875,
0.0145263671875,
-0.00927734375,
0.01025390625,
-0.0225830078125,
-0.00909423828125,
-0.0142822265625,
0.02978515625,
0.0101318359375,
-0.072265625,
-0.049560546875,
0.0311279296875,
0.0101318359375,
-0.00445556640625,
-0.03369140625,
0.0615234375,
-0.021240234375,
-0.034912109375,
-0.004150390625,
0.07177734375,
0.0262451171875,
-0.013671875,
0.03369140625,
0.021728515625,
0.0252685546875,
-0.00982666015625,
-0.0028228759765625,
-0.00151824951171875,
0.003753662109375,
0.004791259765625,
-0.03759765625,
0.025146484375,
0.0235595703125,
-0.0103759765625,
0.0084228515625,
-0.0191650390625,
0.044921875,
0.052001953125,
-0.053955078125,
0.0206298828125,
0.036376953125,
0.01202392578125,
0.0196533203125,
0.025146484375,
-0.000457763671875,
-0.0033721923828125,
-0.0101318359375,
0.005035400390625,
0.0089111328125,
0.002227783203125,
-0.041015625,
0.00927734375,
0.10693359375,
0.01043701171875,
0.00616455078125,
0.013916015625,
0.0035858154296875,
-0.0283203125,
-0.06591796875,
0.016845703125,
-0.0615234375,
-0.01385498046875,
-0.05029296875,
-0.01556396484375,
-0.0128173828125,
-0.0140380859375,
0.00732421875,
-0.0164794921875,
0.01239013671875,
0.0150146484375,
-0.0169677734375,
-0.142578125,
-0.0008087158203125,
-0.007171630859375,
0.0033416748046875,
0.03662109375,
-0.00341796875,
-0.0198974609375,
-0.03955078125,
-0.06640625,
-0.05712890625,
0.0133056640625,
-0.0277099609375,
-0.005462646484375,
-0.0224609375,
0.002166748046875,
0.0296630859375,
-0.030517578125,
-0.029052734375,
0.030517578125,
-0.0162353515625,
-0.0252685546875,
-0.0174560546875,
-0.023193359375,
0.005340576171875,
-0.026611328125,
-0.006011962890625,
-0.01611328125,
-0.033203125,
-0.000820159912109375,
0.04248046875,
-0.0301513671875,
0.019287109375,
0.00909423828125,
0.0400390625,
0.03564453125,
0.030517578125,
0.0140380859375,
-0.0014495849609375,
0.01251220703125,
-0.0556640625,
0.04248046875,
-0.056640625,
-0.030517578125,
-0.009033203125,
-0.01165771484375,
-0.0277099609375,
0.0087890625,
0.024658203125,
-0.043701171875,
0.0146484375,
-0.020751953125,
0.01007080078125,
0.0164794921875,
0.06591796875,
-0.06298828125,
0.0303955078125,
-0.020263671875,
-0.007171630859375,
0.004425048828125,
0.0020751953125,
-0.01373291015625,
0.050537109375,
-0.08203125,
-0.036865234375,
0.019775390625,
-0.0050048828125,
0.06640625,
-0.00823974609375,
0.04248046875,
-0.02734375,
0.004425048828125,
0.006500244140625,
-0.00537109375,
-0.0546875,
0.01470947265625,
0.0089111328125,
0.0025787353515625,
-0.00860595703125,
0.018310546875,
-0.0208740234375,
-0.06689453125,
-0.00592041015625,
-0.0303955078125,
-0.0021514892578125,
0.00372314453125,
0.0218505859375,
-0.0301513671875,
0.02587890625,
-0.01239013671875,
0.04541015625,
0.03662109375,
0.07421875,
0.0225830078125,
0.000385284423828125,
0.06396484375,
-0.0020294189453125,
0.008056640625,
0.0111083984375,
0.0081787109375,
0.033447265625,
-0.01544189453125,
0.09912109375,
-0.00994873046875,
0.0208740234375,
-0.044921875,
-0.017333984375,
0.0211181640625,
0.08056640625,
-0.02294921875,
0.062255859375,
0.01708984375,
-0.011474609375,
0.03125,
0.018798828125,
0.0019683837890625,
0.0390625,
-0.0052490234375,
-0.01312255859375,
-0.002044677734375,
-0.046630859375,
0.015625,
0.037353515625,
0.002777099609375,
-0.00811767578125,
0.0233154296875,
-0.007232666015625,
0.039794921875,
0.009765625,
0.01708984375,
0.043212890625,
0.0101318359375,
0.02783203125,
-0.021484375,
-0.01470947265625,
-0.013671875,
0.01470947265625,
-0.032470703125,
-0.0185546875,
-0.00506591796875,
0.03125,
-0.03125,
0.048095703125,
-0.01165771484375,
0.0296630859375,
0.017822265625,
-0.011474609375,
0.00787353515625,
0.005645751953125,
-0.0093994140625,
0.0380859375,
-0.017333984375,
0.0255126953125,
-0.01806640625,
0.033447265625,
-0.001739501953125,
-0.0191650390625,
0.0054931640625,
0.053466796875,
-0.0308837890625,
-0.006072998046875,
0.0179443359375,
0.025634765625,
-0.01318359375,
0.01214599609375,
-0.0040283203125,
0.03125,
0.047119140625,
0.056640625,
0.01116943359375,
-0.0235595703125,
0.00872802734375,
0.003936767578125,
-0.000652313232421875,
-0.004638671875,
-0.00341796875,
0.0035400390625,
0.0322265625,
0.0169677734375,
0.008056640625,
-0.0262451171875,
-0.080078125,
-0.04638671875,
0.017822265625,
-0.01544189453125,
-0.0296630859375,
0.033203125,
0.10693359375,
0.038330078125,
0.00286865234375,
0.04541015625,
0.021484375,
-0.04931640625,
0.01953125,
-0.035400390625,
-0.07958984375,
-0.07177734375,
-0.004119873046875,
0.0208740234375,
0.10498046875,
-0.006011962890625,
-0.04248046875,
-0.045654296875,
-0.001434326171875,
0.00994873046875,
0.0164794921875,
0.0277099609375,
0.006378173828125,
-0.0286865234375,
0.0191650390625,
-0.02587890625,
0.011474609375,
0.0703125,
0.00732421875,
-0.032470703125,
-0.0211181640625,
-0.05615234375,
0.01092529296875,
0.031494140625,
0.0247802734375,
-0.00372314453125,
-0.0013427734375,
-0.06982421875,
-0.00714111328125,
0.0196533203125,
0.017333984375,
0.01336669921875,
-0.00188446044921875,
-0.01019287109375,
-0.00836181640625,
0.035400390625,
-0.0400390625,
0.01177978515625,
0.009033203125,
0.004425048828125,
0.0157470703125,
0.0015716552734375,
0.007354736328125,
-0.037841796875,
0.01214599609375,
-0.002685546875,
-0.0233154296875,
0.0107421875,
-0.052001953125,
-0.057373046875,
0.007354736328125,
0.007171630859375,
-0.02734375,
-0.005523681640625,
-0.0010528564453125,
-0.0106201171875,
0.0022125244140625,
0.08642578125,
-0.033447265625,
0.03125,
-0.0225830078125,
-0.03125,
0.007049560546875,
-0.0712890625,
-0.0023956298828125,
0.006317138671875,
-0.056640625,
0.0206298828125,
-0.017333984375,
0.048828125,
0.0419921875,
-0.0037078857421875,
0.02978515625,
0.0159912109375,
0.04443359375,
-0.060302734375,
-0.038330078125,
0.0277099609375,
0.036865234375,
0.0322265625,
-0.006011962890625,
-0.022705078125,
-0.050537109375,
0.041748046875,
0.01507568359375,
0.007232666015625,
0.050048828125,
0.041015625,
-0.0240478515625,
0.006561279296875,
0.000301361083984375,
-0.01446533203125,
0.01507568359375,
-0.0556640625,
-0.02734375,
-0.0081787109375,
0.04736328125,
0.0025787353515625,
-0.0283203125,
0.01129150390625,
-0.0252685546875,
0.0225830078125,
0.00421142578125,
-0.004791259765625,
0.005401611328125,
0.006011962890625,
-0.057861328125,
-0.0289306640625,
0.008544921875,
-0.0115966796875,
0.022705078125,
-0.008056640625,
-0.01416015625,
0.0021209716796875,
-0.0038909912109375,
0.01025390625,
-0.0247802734375,
0.01275634765625,
0.031494140625,
-0.00015926361083984375,
-0.004119873046875,
0.041259765625,
-0.040283203125,
0.052734375,
-0.052734375,
-0.0225830078125,
-0.06884765625,
-0.07421875,
-0.0341796875,
0.0196533203125,
0.0038604736328125,
-0.041015625,
-0.005615234375,
-0.039794921875,
-0.0205078125,
0.04931640625,
-0.0289306640625,
0.006561279296875,
0.020751953125,
-0.03369140625,
0.020263671875,
-0.039794921875,
0.0081787109375,
-0.0020294189453125,
0.0218505859375,
0.01611328125,
-0.0281982421875,
-0.027099609375,
0.00131988525390625,
-0.0654296875,
0.023681640625,
-0.006256103515625,
-0.0211181640625,
-0.006103515625,
-0.034912109375,
-0.00299072265625,
-0.0059814453125,
-0.036376953125,
-0.0174560546875,
-0.056640625,
0.000316619873046875,
-0.033935546875,
0.00726318359375,
-0.033447265625,
0.06591796875,
0.0018768310546875,
0.02734375,
-0.04345703125,
0.032958984375,
-0.01953125,
-0.0034332275390625,
0.01300048828125,
0.000713348388671875,
-0.008056640625,
-0.004730224609375,
-0.0341796875,
0.030517578125,
-0.0255126953125,
0.0286865234375,
-0.04638671875,
0.025146484375,
-0.03515625,
0.001220703125,
-0.01068115234375,
0.043212890625,
-0.01416015625,
-0.0223388671875,
-0.03369140625,
-0.0059814453125,
0.007049560546875,
0.0220947265625,
-0.0281982421875,
-0.00811767578125,
-0.00799560546875,
0.0263671875,
0.01373291015625,
0.00194549560546875,
0.20703125,
-0.0076904296875,
0.06591796875,
-0.0059814453125,
0.01043701171875,
-0.01251220703125,
0.038330078125,
-0.0087890625,
0.09765625,
0.003082275390625,
0.0096435546875,
0.000152587890625,
0.02392578125,
-0.04638671875,
-0.007781982421875,
-0.0030975341796875,
-0.00098419189453125,
-0.0181884765625,
-0.007232666015625,
-0.0235595703125,
0.033447265625,
0.0125732421875,
-0.00171661376953125,
0.022216796875,
-0.000278472900390625,
-0.02392578125,
0.0294189453125,
-0.0128173828125,
0.039794921875,
-0.0247802734375,
-0.049560546875,
0.003631591796875,
0.0206298828125,
0.0164794921875,
0.028564453125,
-0.01251220703125,
0.02294921875,
-0.00099945068359375,
-0.022216796875,
-0.050537109375,
0.001861572265625,
-0.0191650390625,
-0.045654296875,
-0.023681640625,
0.004119873046875,
0.050048828125,
-0.007232666015625,
0.05029296875,
-0.027099609375,
-0.0169677734375,
0.035888671875,
-0.02001953125,
-0.0040283203125,
0.01043701171875,
0.0035400390625,
-0.0059814453125,
0.0263671875,
-0.01220703125,
-0.016357421875,
0.01483154296875,
-0.0390625,
-0.00396728515625,
0.04345703125,
0.023193359375,
-0.0172119140625,
0.021240234375,
-0.04736328125,
0.006866455078125,
0.036376953125,
-0.0289306640625,
0.0208740234375,
0.011474609375,
-0.0260009765625,
0.01904296875,
0.016357421875,
-0.04931640625,
-0.0478515625,
0.0213623046875,
-0.01275634765625,
-0.032470703125,
-0.0084228515625,
-0.001922607421875,
0.0191650390625,
0.008544921875,
-0.037353515625,
0.0111083984375,
-0.022705078125,
-0.04052734375,
0.033447265625,
-0.01116943359375,
-0.03369140625,
-0.0167236328125,
-0.0303955078125,
0.0191650390625,
-0.06591796875,
0.00433349609375,
-0.078125,
-0.00732421875,
0.04150390625,
0.00286865234375,
0.0157470703125,
-0.00008440017700195312,
0.041015625,
-0.034423828125,
0.053466796875,
0.005218505859375,
0.01239013671875,
-0.006072998046875,
-0.00628662109375,
0.018798828125,
0.00151824951171875,
0.01165771484375,
0.0164794921875,
0.0089111328125,
0.0247802734375,
0.024658203125,
-0.0031585693359375,
-0.0181884765625,
-0.0103759765625,
-0.04541015625,
0.00323486328125,
0.0157470703125,
-0.004791259765625,
0.0308837890625,
-0.03076171875,
0.00506591796875,
0.02392578125,
-0.0179443359375,
0.0269775390625,
-0.003509521484375,
-0.0181884765625,
0.020263671875,
0.03076171875,
0.00145721435546875,
-0.004547119140625,
0.0167236328125,
0.035400390625,
-0.0206298828125,
-0.0810546875,
-0.0113525390625,
-0.02294921875,
0.0022735595703125,
0.0250244140625,
-0.006561279296875,
0.0223388671875,
0.043212890625,
0.03564453125,
-0.009765625,
0.036376953125,
-0.0028228759765625,
-0.003204345703125,
-0.041748046875,
-0.001495361328125,
-0.05419921875,
-0.04248046875,
-0.01708984375,
-0.002777099609375,
-0.01251220703125,
-0.001739501953125,
-0.0084228515625,
0.00130462646484375,
-0.000579833984375,
-0.0262451171875,
0.01446533203125,
0.037841796875,
0.02001953125,
-0.0157470703125,
-0.017333984375,
0.006317138671875,
0.0125732421875,
-0.0206298828125,
0.02197265625,
0.012939453125,
-0.0162353515625,
-0.068359375,
0.0250244140625,
0.006317138671875,
0.00201416015625,
-0.005126953125,
-0.00872802734375,
-0.06591796875,
0.0233154296875,
-0.01806640625,
-0.01312255859375,
-0.042236328125,
-0.03125,
-0.0198974609375,
-0.00193023681640625,
-0.004364013671875,
-0.0262451171875,
-0.0439453125,
-0.020263671875,
-0.01043701171875,
-0.0250244140625,
0.036865234375,
0.00946044921875,
-0.012939453125,
0.0213623046875,
-0.032470703125,
-0.048095703125,
0.020263671875,
0.00482177734375,
0.048583984375,
0.0301513671875,
-0.046630859375,
-0.021728515625,
-0.00408935546875,
0.0164794921875,
-0.006591796875,
-0.01409912109375,
0.0439453125,
-0.003936767578125,
-0.053955078125,
-0.0123291015625,
-0.025146484375,
-0.0185546875,
-0.061767578125,
0.0045166015625,
-0.01348876953125,
-0.025634765625,
0.0013580322265625,
-0.033447265625,
-0.0093994140625,
-0.039794921875,
0.04638671875,
-0.00799560546875,
-0.03955078125,
-0.015869140625,
0.0322265625,
0.0093994140625,
0.02978515625,
0.005828857421875,
-0.00011873245239257812,
-0.003936767578125,
0.0274658203125,
0.00701904296875,
0.000858306884765625,
-0.0146484375,
0.020263671875,
0.04150390625,
0.0216064453125,
-0.0164794921875,
0.043212890625,
0.0546875,
0.050048828125,
0.01470947265625,
-0.006561279296875,
0.00958251953125,
-0.0164794921875,
-0.01177978515625,
-0.03564453125,
-0.009765625,
0.007598876953125,
-0.00133514404296875,
-0.0322265625,
0.0211181640625,
0.029296875,
0.009765625,
-0.00151824951171875,
-0.0019073486328125,
-0.00830078125,
-0.032958984375,
0.028564453125,
-0.0186767578125,
-0.03369140625,
0.00897216796875,
-0.0040283203125,
-0.01104736328125,
0.01434326171875,
0.05419921875,
0.012939453125,
-0.0341796875,
-0.002471923828125,
-0.01446533203125,
0.04150390625,
0.04541015625,
0.0203857421875,
0.01251220703125,
-0.01348876953125,
0.01312255859375,
0.0052490234375,
0.0419921875,
-0.01092529296875,
-0.022705078125,
-0.0252685546875,
0.00836181640625,
-0.03759765625,
-0.03076171875,
0.009033203125,
-0.017822265625,
-0.00982666015625,
0.04736328125,
0.025634765625,
0.01165771484375,
-0.032470703125,
-0.0267333984375,
0.03515625,
0.0012664794921875,
-0.061767578125,
-0.0089111328125,
0.050048828125,
0.01275634765625,
0.0250244140625,
0.0191650390625,
0.036376953125,
0.005828857421875,
-0.0047607421875,
0.0023956298828125,
-0.031494140625,
-0.003143310546875,
-0.004669189453125,
0.0194091796875,
0.00897216796875,
-0.01104736328125,
0.0849609375,
0.007232666015625,
-0.0201416015625,
-0.0260009765625,
-0.0172119140625,
0.08984375,
0.0517578125,
-0.0260009765625,
-0.0260009765625,
-0.01239013671875,
-0.0247802734375,
0.0164794921875,
-0.0225830078125,
0.018798828125,
-0.038330078125,
-0.0308837890625,
-0.024658203125,
-0.002227783203125,
0.003204345703125,
0.0185546875,
-0.04443359375,
-0.004302978515625,
0.03271484375,
-0.01507568359375,
-0.00080108642578125,
0.038818359375,
0.0040283203125,
0.007049560546875,
-0.0198974609375,
0.004364013671875,
-0.02392578125,
-0.029296875,
-0.0037994384765625,
-0.033447265625,
0.01348876953125,
0.058837890625,
0.00811767578125,
0.0341796875,
0.0064697265625,
0.025146484375,
-0.000492095947265625,
0.0289306640625,
-0.0074462890625,
0.0189208984375,
0.031494140625,
-0.00518798828125,
-0.0283203125,
-0.0011444091796875,
0.03857421875,
0.036376953125,
0.026611328125,
0.005615234375,
-0.0032958984375,
-0.025146484375,
-0.02197265625,
0.0194091796875,
0.0181884765625,
0.07275390625,
-0.0045166015625,
0.025634765625,
-0.0286865234375,
0.0103759765625,
-0.01141357421875,
-0.03564453125,
0.049560546875,
-0.03173828125,
-0.0269775390625,
0.02734375,
0.041015625,
-0.01953125,
-0.0003528594970703125,
-0.0133056640625,
0.0203857421875,
-0.011474609375,
0.0150146484375,
0.0001010894775390625,
-0.01165771484375,
-0.026611328125,
0.03564453125,
-0.012939453125,
0.00024127960205078125,
-0.01446533203125,
0.0322265625,
-0.03125,
-0.0093994140625,
0.00066375732421875,
-0.02197265625,
0.0115966796875,
-0.033935546875,
-0.045166015625,
0.0361328125,
0.017822265625,
0.0322265625,
-0.0263671875,
0.04345703125,
-0.0133056640625,
-0.0478515625,
0.0087890625,
0.0208740234375,
-0.014892578125,
0.0135498046875,
0.044921875,
-0.006011962890625,
-0.0042724609375,
-0.0242919921875,
0.025146484375,
-0.021728515625,
-0.033935546875,
-0.0198974609375,
0.0172119140625,
0.00958251953125,
0.01806640625,
0.04345703125,
-0.01104736328125,
0.036376953125,
0.00141143798828125,
-0.03466796875,
-0.0001964569091796875,
-0.035400390625,
0.0191650390625,
0.0031280517578125,
0.00848388671875,
0.03076171875,
-0.0277099609375,
0.01953125,
0.08447265625,
0.0169677734375,
-0.02783203125,
-0.045166015625,
0.0181884765625,
0.039794921875,
-0.023681640625,
-0.00701904296875,
0.040771484375,
-0.020751953125,
0.06689453125,
-0.048828125,
-0.00164031982421875,
0.006256103515625,
0.01007080078125,
0.00323486328125,
-0.03662109375,
0.0419921875,
0.0002651214599609375,
-0.0301513671875,
-0.0111083984375,
0.072265625,
0.10009765625,
0.03564453125,
-0.0032501220703125,
0.06103515625,
0.0103759765625,
0.011474609375,
-0.034912109375,
-0.0115966796875,
0.025634765625,
-0.062255859375,
-0.01416015625,
0.037841796875,
-0.006683349609375,
-0.04736328125,
0.04345703125,
0.0021514892578125,
-0.0311279296875,
0.00885009765625,
-0.0107421875,
0.00372314453125,
-0.021728515625,
-0.00020503997802734375,
0.000263214111328125,
-0.01446533203125,
0.0028228759765625,
0.0277099609375
]
question_embedding = json.dumps(question_embedding)
question_ocr = "物理"
# res = search_question_text_and_embedding(book_id, question_ocr, question_embedding)
# print(res)
print("\nfinished.")
# !/usr/bin/env python
# -*- coding: utf-8 -*-
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
import numpy as np
import json
# Connect to Elasticsearch; `es` is the module-level client the search/insert helpers in this file call.
es = Elasticsearch(
    hosts = ["http://192.168.8.224:19200"], # new ES index set up by Wang Lei
    # basic_auth=("elastic", "password") # replace with your credentials
)
# 创建索引(如果不存在)
def create_index(index_name):
    """Create the image index with its mapping unless it already exists.

    The mapping declares a keyword ``image_url``, a full-text
    ``image_description`` and a 2048-dimensional ``image_vector`` that is
    indexed for cosine similarity.

    Args:
        index_name: Name of the Elasticsearch index to create.
    """
    if es.indices.exists(index=index_name):
        return

    vector_field = {
        "type": "dense_vector",
        "dims": 2048,  # vector dimension; adjust to the embedding model in use
        "index": True,
        "similarity": "cosine",
    }
    properties = {
        "image_url": {"type": "keyword"},
        "image_description": {"type": "text"},
        "image_vector": vector_field,
    }
    es.indices.create(
        index=index_name,
        body={"mappings": {"properties": properties}},
    )
    print(f"索引 {index_name} 创建成功")
# 插入单条图片数据
def insert_single_image(index_name, image_url, image_description, image_vector):
    """Index a single image document and report it on stdout.

    Args:
        index_name: Target Elasticsearch index.
        image_url: Value stored in the ``image_url`` field.
        image_description: Value stored in the ``image_description`` field.
        image_vector: Value stored in the ``image_vector`` field.
    """
    es.index(
        index=index_name,
        document={
            "image_url": image_url,
            "image_description": image_description,
            "image_vector": image_vector,
        },
    )
    print(f"图片 {image_url} 已插入")
# 批量插入图片数据
def insert_bulk_images(index_name, image_data_list):
    """Insert many image documents with one bulk request.

    Args:
        index_name: Target Elasticsearch index.
        image_data_list: Iterable of dicts, each providing the keys
            ``image_url``, ``image_description`` and ``image_vector``.
    """
    fields = ("image_url", "image_description", "image_vector")
    actions = [
        {
            "_index": index_name,
            "_source": {field: record[field] for field in fields},
        }
        for record in image_data_list
    ]
    bulk(es, actions)
    print(f"批量插入 {len(actions)} 条数据成功")
# 文本检索
def text_search(index_name, query_text, size=10):
    """Run a full-text ``match`` query on ``image_description``.

    Args:
        index_name: Index to search.
        query_text: Text to match against ``image_description``.
        size: Maximum number of hits to return.

    Returns:
        The list found under ``hits.hits`` in the search response.
    """
    match_query = {"match": {"image_description": query_text}}
    response = es.search(index=index_name, query=match_query, size=size)
    return response['hits']['hits']
# 向量检索
def vector_search(index_name, query_vector, size=10):
    """Rank every document by cosine similarity to ``query_vector``.

    The script adds 1.0 to the cosine similarity, so the score it produces
    is never negative.

    Args:
        index_name: Index to search.
        query_vector: Query embedding compared with ``image_vector``.
        size: Maximum number of hits to return.

    Returns:
        The list found under ``hits.hits`` in the search response.
    """
    similarity_script = {
        "source": "cosineSimilarity(params.query_vector, 'image_vector') + 1.0",
        "params": {"query_vector": query_vector},
    }
    scored_query = {
        "script_score": {
            "query": {"match_all": {}},
            "script": similarity_script,
        }
    }
    response = es.search(index=index_name, query=scored_query, size=size)
    return response['hits']['hits']
# 向量-文本联合检索
def hybrid_search(index_name, query_text, query_vector, text_weight=0.5, vector_weight=0.5, size=10):
    """Search with a weighted combination of text match and vector similarity.

    Two ``should`` clauses are combined in one ``bool`` query: a ``match`` on
    ``image_description`` boosted by ``text_weight``, and a ``script_score``
    over all documents based on the cosine similarity to ``query_vector``
    scaled by ``vector_weight``.

    Args:
        index_name: Index to search.
        query_text: Text matched against ``image_description``.
        query_vector: Query embedding compared with ``image_vector``.
        text_weight: Boost applied to the text clause.
        vector_weight: Multiplier applied to the vector clause; expected to
            be non-negative so the script score stays non-negative.
        size: Maximum number of hits to return.

    Returns:
        The list found under ``hits.hits`` in the search response.
    """
    text_clause = {
        "match": {
            "image_description": {
                "query": query_text,
                "boost": text_weight,
            }
        }
    }
    vector_clause = {
        "script_score": {
            "query": {"match_all": {}},
            "script": {
                # Cosine similarity lies in [-1, 1] and Elasticsearch rejects
                # negative script scores, so shift it by 1.0 (as vector_search
                # does) before weighting. Every document receives the same
                # offset, so the relative ranking is unaffected.
                "source": "(cosineSimilarity(params.query_vector, 'image_vector') + 1.0) * params.vector_weight",
                "params": {
                    "query_vector": query_vector,
                    "vector_weight": vector_weight,
                },
            },
        }
    }
    query = {"bool": {"should": [text_clause, vector_clause]}}
    results = es.search(index=index_name, query=query, size=size)
    return results['hits']['hits']
# RRF融合函数
def rrf_fusion(text_results, vector_results, k=60):
    """Fuse two ranked hit lists with Reciprocal Rank Fusion (RRF).

    Each document scores ``1 / (k + rank)`` for every list it appears in
    (ranks start at 1) and the per-list scores are summed. Documents with
    equal fused scores keep a stable order: text hits first in their
    original order, then hits that only appear in the vector results.

    Args:
        text_results: Hits from the text search, best first. Each hit is a
            dict with at least ``_id`` and ``_source``.
        vector_results: Hits from the vector search, same shape.
        k: RRF rank constant, commonly 60.

    Returns:
        A list of ``{'_id', '_score', '_source'}`` dicts sorted by fused
        score, highest first. ``_source`` is taken from the text hit when
        the document appears in both lists.
    """
    text_rank = {hit['_id']: rank for rank, hit in enumerate(text_results, start=1)}
    vector_rank = {hit['_id']: rank for rank, hit in enumerate(vector_results, start=1)}

    # Prefer the text hit as the source of document data; add vector-only hits.
    # The dict's insertion order also fixes the tie-break order used below.
    id_to_hit = {hit['_id']: hit for hit in text_results}
    for hit in vector_results:
        id_to_hit.setdefault(hit['_id'], hit)

    rrf_scores = {}
    for doc_id in id_to_hit:
        text_score = 1 / (k + text_rank[doc_id]) if doc_id in text_rank else 0
        vector_score = 1 / (k + vector_rank[doc_id]) if doc_id in vector_rank else 0
        rrf_scores[doc_id] = text_score + vector_score

    # sorted() is stable, so equal scores keep the insertion order above
    # instead of depending on set iteration order.
    ranked = sorted(rrf_scores.items(), key=lambda item: item[1], reverse=True)
    return [
        {
            '_id': doc_id,
            '_score': score,
            '_source': id_to_hit[doc_id]['_source'],
        }
        for doc_id, score in ranked
    ]
# RRF向量-文本联合检索
def hybrid_search_rrf(index_name, query_text, query_vector, size=10, rrf_k=60):
    """Run text and vector searches and fuse their hits with RRF.

    Args:
        index_name: Index to search.
        query_text: Text passed to ``text_search``.
        query_vector: Query embedding passed to ``vector_search``.
        size: Maximum number of fused results to return.
        rrf_k: Rank constant forwarded to ``rrf_fusion``.

    Returns:
        The first ``size`` fused results as produced by ``rrf_fusion``.
    """
    # Each sub-search must return at least `size` candidates, otherwise a
    # request for more than 10 results is capped by the sub-search default.
    # For size <= 10 the window stays at 10, as before.
    window = max(size, 10)
    text_hits = text_search(index_name, query_text, size=window)
    vector_hits = vector_search(index_name, query_vector, size=window)
    fused_results = rrf_fusion(text_hits, vector_hits, k=rrf_k)
    return fused_results[:size]
# 示例用法
if __name__ == "__main__":
    # Demo script: creates the index, then runs the text, vector, weighted
    # hybrid and RRF hybrid searches in turn and prints the hits of each.
    INDEX_NAME = "tst_images_text"
    # 1. Create the index
    create_index(INDEX_NAME)
    # Sample data
    sample_images = [
        {
            "image_url": "https://oss.5rs.me/oss/upload/image/jpeg/b2f3fbaa98a3f2552f2dba7451decf7b_36_20250605135114454.jpeg",
            "image_description": "、填一填。培优作业18 80 78 2.比 16 少9的数是( )多8的数是 17;27 );比( 与 30的和是( );47比3多( )个十和( )个一;7个十和2个一 3.83 里面有( 合起来是( ),它们 4.最大的两位数是(),最小的三位数是(相差(5.写一写,比一比。百位十位个位百位十位个位百位十位个位选择。(把正确答案的序号填在括号里)1.根据艾蒿叶子的影子判断生长时间最短的叶子是(2.在远古时代,为了记录猎物的多少,人们常用石子计数。假设用0表示10.用一表示1,那么下面表示的数最大的是",
            "image_vector": np.random.random(2048).tolist() # replace with the real image vector
        },
        {
            "image_url": "https://oss.5rs.me/oss/upload/image/jpeg/9579d101ec560c85f204fa11b7a792c0_2_20250605094300322.jpeg",
            "image_description": "、直接写得数。培优作业1*18 x10=50 x90 =18 x41~51 x22~30x40 =12 x70 =32 x49 ~17 x78 ~20 x15 =44 x20 =78 x11~46 x23 ~二、填一填。1.13的30倍是(),10个22的和是()位数,积大约是(2.49 x28 的积是()。3.25x40的积的末尾有()个0。4.小亮平均每分钟走69米,他从家走到学校用了12分钟。小亮家离学校大约有()米。5.晶晶平均每分钟可以写25个字,照这样的速度,她15分钟可以写)个字。三、用竖式计算,并验算。18 x51 =32 x60 =46 x25 =四、在( )里填上“ >”“<”或“=”30 x30( )100070x12()80034 x56)36 x5415 x40( )60011 x44(訕興癪9 x13)450)13 x8921 x3425 x801圧ʌ帳八廈仈 x30)600)200023 x45",
            "image_vector": np.random.random(2048).tolist() # replace with the real image vector
        }
    ]
    # 2. Bulk insert (currently disabled; the sample data above is not written to the index)
    # insert_bulk_images(INDEX_NAME, sample_images)
    # 3. Text search example
    print("\n3、文本检索结果:")
    text_results = text_search(INDEX_NAME, "填一填")
    for hit in text_results:
        print(f"得分: {hit['_score']}, \n描述: {hit['_source']['image_description']}")
        print()
    # 4. Vector search example (queries with the first sample's random vector)
    print("\n4、向量检索结果:")
    vector_results = vector_search(INDEX_NAME, sample_images[0]["image_vector"])
    for hit in vector_results:
        print(f"得分: {hit['_score']}, \n描述: {hit['_source']['image_description']}")
        print()
    # 5. Weighted hybrid search example
    print("\n5、联合检索结果:")
    hybrid_results = hybrid_search(
        INDEX_NAME,
        "可爱的动物",
        sample_images[0]["image_vector"],
        text_weight=0.4,
        vector_weight=0.6
    )
    for hit in hybrid_results:
        print(f"得分: {hit['_score']}, \n描述: {hit['_source']['image_description']}")
        print()
    # 6. RRF hybrid search example
    print("\n6、RRF联合检索结果:")
    hybrid_results = hybrid_search_rrf(
        INDEX_NAME,
        "填一填。培优作业18 80 78 2.比 16 少9的数是( )多8的数是 17;27 );比( 与 30的和是( );47比3多( )个十和( )个一;7个十和2个",
        sample_images[0]["image_vector"],
        size=10,
        rrf_k=60
    )
    for hit in hybrid_results:
        print(f"得分: {hit['_score']:.4f}, \n描述: {hit['_source']['image_description']}")
        print()
    print("\nfinished.")
from tritonclient import grpc as triton_grpc
import numpy as np
import json
from PIL import Image
import io
import base64
import time
# Tensor names used when building the Triton request input and when reading
# the response output in triton_request.
INPUT_NAME = "input"
OUTPUT_NAME = "output"
def _create_triton_input(data):
    """Serialise ``data`` as compact JSON and wrap it for a Triton BYTES input.

    Args:
        data: JSON-serialisable request payload.

    Returns:
        A NumPy object array of shape (1, 1) holding the UTF-8 encoded JSON.
    """
    payload = json.dumps(data, separators=(",", ":")).encode("utf-8")
    return np.array([[payload]], dtype=np.object_)
def _parse_triton_output(data):
    """Decode the JSON document stored at position ``[0, 0]`` of ``data``.

    Args:
        data: Two-dimensional array whose first element is UTF-8 encoded JSON.

    Returns:
        The parsed JSON value.
    """
    raw = data[0, 0]
    return json.loads(raw.decode("utf-8"))
def triton_request(client, data, *, request_kwargs=None):
    """Send one JSON payload to the ``'ocr'`` model and return its decoded reply.

    Args:
        client: Object exposing ``infer(model_name, inputs=..., **kwargs)``;
            the demo in this file passes a Triton gRPC inference client.
        data: JSON-serialisable payload, packed by ``_create_triton_input``.
        request_kwargs: Optional extra keyword arguments forwarded to
            ``client.infer``.

    Returns:
        The parsed JSON value produced by ``_parse_triton_output``.
    """
    extra_kwargs = {} if request_kwargs is None else request_kwargs
    infer_input = triton_grpc.InferInput(INPUT_NAME, [1, 1], "BYTES")
    infer_input.set_data_from_numpy(_create_triton_input(data))
    response = client.infer('ocr', inputs=[infer_input], **extra_kwargs)
    return _parse_triton_output(response.as_numpy(OUTPUT_NAME))
def infer_paddle_ocr(im:Image.Image, client, *, max_retries=None, retry_delay=0.0):
    """Run OCR on an image through the Triton-served OCR model.

    The image is JPEG-encoded, base64-wrapped and sent via ``triton_request``.
    The request is repeated until the service answers with ``errorCode == 0``.

    Args:
        im: Image to recognise; it must be in a mode that can be saved as JPEG.
        client: Inference client handed to ``triton_request``.
        max_retries: Maximum number of retries after the first failed
            attempt. ``None`` (default) retries without limit, which is the
            original behaviour.
        retry_delay: Seconds to sleep between attempts. ``0.0`` (default)
            retries immediately, which is the original behaviour.

    Returns:
        A ``(rec_boxes, rec_texts)`` tuple taken from the first OCR result's
        ``prunedResult``.

    Raises:
        RuntimeError: If ``max_retries`` is set and every attempt returned a
            non-zero ``errorCode``.
    """
    buffer = io.BytesIO()
    im.save(buffer, format='JPEG')
    base64_file = base64.b64encode(buffer.getvalue()).decode("ascii")
    # NOTE(review): fileType=1 presumably marks the payload as an image;
    # confirm against the OCR service contract.
    request = {"file": base64_file,
               "fileType": 1}

    failures = 0
    while True:
        output_ocr = triton_request(client, request)
        error_code = output_ocr["errorCode"]
        if error_code == 0:
            res_ocr = output_ocr["result"]["ocrResults"][0]['prunedResult']
            return res_ocr['rec_boxes'], res_ocr['rec_texts']

        failures += 1
        # Without a bound, a persistently failing service would make this loop
        # spin forever; callers can opt into a limit and a back-off delay.
        if max_retries is not None and failures > max_retries:
            raise RuntimeError(
                f"OCR request failed with errorCode={error_code} "
                f"after {failures} attempt(s)"
            )
        if retry_delay > 0:
            time.sleep(retry_delay)
if __name__=='__main__':
    # Demo: OCR a single local image through the Triton endpoint and print
    # the elapsed time and the recognised texts.
    start_time = time.time()
    paddle_ocr_url = '61.170.32.8:38896'
    client_ocr = triton_grpc.InferenceServerClient(paddle_ocr_url)
    # `im` is reassigned three times; only the last path is actually opened.
    im = '/data/liyaze/doc2xkey_server_v2/utils/my-obs-object-key-demo.jpg'
    im = "/home/liuxin/work/search_book/data/12664728-课计划·七年级英语·RJ·上册/1.jpg"
    im = "/home/liuxin/work/search_question/data/12664728-课计划·七年级英语·RJ·上册/93.jpg"
    # Convert to RGB before the image is JPEG-encoded inside infer_paddle_ocr.
    im = Image.open(im).convert('RGB')
    db_boxes, rec_texts = infer_paddle_ocr(im, client_ocr)
    # Elapsed time includes client creation and image loading.
    print(f"耗时:{time.time()-start_time}")
    print(rec_texts)
\ No newline at end of file
import re,json,sys,os
import pandas as pd
import numpy as np
class prepareData():
    """Turn a question-bank Excel export into JSON records ready for Elasticsearch.

    The Excel sheet uses (mostly Chinese) column headers; ``title2tile`` maps
    them onto the normalized field names that the rest of the pipeline uses.
    """

    def __init__(self):
        """Define the normalized field names and the header-to-field mapping."""
        self.index = "index"  # converted to int by read_excel
        self.fileId = "bankId"  # converted to int by read_excel
        self.question_image_url = "question_image_url"
        self.question_text = "question_text"
        self.answer_image_url = "answer_image_url"
        self.answer_text = "answer_text"
        self.question_chapter = "question_chapter"
        self.question_category = "question_category"
        self.page_id = "page_id"  # converted to int by read_excel
        self.book_page_id = "book_page_id"
        self.knowledge_point = "knowledge_point"
        self.knowledge_point_code = "knowledge_point_code"
        self.difficult = "difficult"
        self.resolve = "resolve"
        self.appendix = "appendix"
        self.source_image_url = "source_image_url"
        self.page_ocr = "page_ocr"
        self.question_ocr = 'question_ocr'
        # Excel column header -> normalized field name.
        self.title2tile = {
            "序号": self.index,
            "题目id": self.fileId,
            "题目图片地址": self.question_image_url,
            "题目文本": self.question_text,
            "答案图片地址": self.answer_image_url,
            "答案文本": self.answer_text,
            "所属章节": self.question_chapter,
            "题目类型": self.question_category,
            "电子页码": self.page_id,
            "纸书页码": self.book_page_id,
            "知识点": self.knowledge_point,
            "知识点代码": self.knowledge_point_code,
            "难度": self.difficult,
            "解析": self.resolve,
            "附件": self.appendix,
            "source_image_url": self.source_image_url,
            "page_ocr": self.page_ocr,
            "question_ocr": self.question_ocr,
        }

    def clean(self, text):
        """Strip ``<img>`` tags, turn ``<p>``/``</p>`` markers into spaces and trim the result."""
        text = re.sub(r'<img[^>]*>', '', text)
        text = re.sub(r"</p><p>", " ", text)
        text = re.sub(r"<p>", " ", text)
        text = re.sub(r"</p>", " ", text)
        return text.strip()

    def read_excel(self, excel_path, sheet_name="Sheet1", book_id=None, book_name=None):
        """Read one sheet of the export and normalize every row into a record.

        Args:
            excel_path: Path of the Excel workbook.
            sheet_name: Sheet to read.
            book_id: Book identifier stored (as a string) on every record.
            book_name: Book title stored on every record.

        Returns:
            A list of dicts, one per row. Each record gains ``book_id``,
            ``book_name``, ``book_id_page_id`` and ``page_id2bank_ids`` (the
            ids of all questions on the same page); ``question_category`` and
            ``appendix`` are removed.

        Raises:
            KeyError: If a required column is missing from the sheet.
            ValueError: If an id or page number cell is blank or not numeric.
        """
        data = pd.read_excel(excel_path, sheet_name=sheet_name)
        data = data.rename(columns=self.title2tile)
        # fillna returns a new frame; the result used to be discarded, which
        # left NaN in blank cells (later turned into "nan" strings / invalid JSON).
        data = data.fillna("")
        data = data.to_dict(orient='records')

        int_fields = (self.index, self.fileId, self.page_id)
        str_fields = (
            self.question_image_url, self.question_text,
            self.answer_image_url, self.answer_text,
            self.question_chapter, self.question_category,
            self.book_page_id, self.knowledge_point,
            self.knowledge_point_code, self.difficult,
            self.resolve, self.appendix,
        )
        embedding_fields = ("question_image_embedding", "page_image_embedding")

        page_id2bank_id = {}  # page_id -> ids of every question on that page
        for item in data:
            for field in int_fields:
                item[field] = int(item[field])
            for field in str_fields:
                item[field] = str(item[field])
            page_id = item[self.page_id]

            item['book_id'] = str(book_id)
            item['book_name'] = book_name
            item['book_id_page_id'] = str(book_id) + "-" + str(page_id)
            item.pop(self.question_category)
            item.pop(self.appendix)

            # The OCR columns already hold plain text and are stored as read;
            # the lookup still fails fast with KeyError when a column is missing.
            for field in (self.page_ocr, self.question_ocr):
                item[field] = item[field]

            page_id2bank_id.setdefault(page_id, []).append(item[self.fileId])

            # Embedding columns are optional and hold JSON-encoded vectors.
            for field in embedding_fields:
                if field not in item:
                    continue
                raw = item[field]
                if isinstance(raw, str) and not raw.strip():
                    # Blank cell: drop the key instead of crashing in json.loads.
                    item.pop(field)
                else:
                    item[field] = json.loads(raw)

        for item in data:
            item['page_id2bank_ids'] = page_id2bank_id[item[self.page_id]]
        return data

    def write2json(self, path, data_list):
        """Dump ``data_list`` to ``path`` as indented UTF-8 JSON (non-ASCII kept as-is)."""
        with open(path, "w", encoding='utf-8') as f:
            f.write(json.dumps(data_list, ensure_ascii=False, indent=4))
        print(f"success save: {path} ")

    def result(self):
        """Convert the hard-coded sample workbook and save it next to it as ``<excel>.json``."""
        excel_file = "/home/liuxin/work/search_question/data/12671977-暑假生活·学期总复习八年级物理通用版6-原图/12671977-暑假生活·学期总复习八年级物理通用版6.xlsx"
        sheet_name = "题库数据"
        book_id = 12671977
        book_name = '暑假生活·学期总复习八年级物理通用版6'
        data_list = self.read_excel(excel_file, sheet_name=sheet_name, book_id=book_id, book_name=book_name)
        save_json_path = excel_file + ".json"
        self.write2json(save_json_path, data_list)
if __name__ == "__main__":
    # Convert the hard-coded sample workbook to JSON, then report completion.
    prepareData().result()
    print("\nfinished.")
# !/usr/bin/env python
# -*- coding: utf-8 -*-
import copy
import asyncio
import json,re,sys,os
from elasticsearch import Elasticsearch, helpers # elasticsearch-8.11.0
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor, as_completed
# 将拆题后的数据写入es库
class EsHelper:
    """Helper around an Elasticsearch client for the question-search index.

    Covers index management, bulk loading of the prepared JSON records,
    per-book deletion, and the page / question search queries (full-text,
    vector, and combined).

    NOTE(review): the ``async`` search methods call the synchronous client, so
    each call blocks the running event loop until Elasticsearch answers.
    """

    # Embedding vectors are large, so they are stripped from every search result.
    _EMBEDDING_FIELDS = ('page_image_embedding', 'question_image_embedding')

    def __init__(self, hosts=None):
        """Connect to Elasticsearch and print the ping result.

        Args:
            hosts: List of Elasticsearch hosts. When omitted, the project's
                default instance is used. (A caller-supplied value used to be
                ignored, leaving the helper without a usable connection.)
        """
        if hosts is None:
            # Endpoints used earlier in development: 116.63.110.220:9200 and localhost:9200.
            hosts = ["http://192.168.8.224:19200"]
        self.hosts = hosts
        self.es = Elasticsearch(hosts=self.hosts)
        print("self.es.ping(): ", self.es.ping())

    @staticmethod
    def _text_with_keyword():
        """Mapping for an analyzed text field with an exact-match ``keyword`` sub-field."""
        return {
            "type": "text",
            "fields": {"keyword": {"type": "keyword", "ignore_above": 256}},
        }

    @staticmethod
    def _dense_vector(dims=1024):
        """Mapping for an indexed cosine-similarity embedding vector."""
        return {"type": "dense_vector", "dims": dims, "index": True, "similarity": "cosine"}

    def create_index(self, index_name):
        """Create the index with the question-search settings and mappings.

        Args:
            index_name: Name of the index to create.

        Returns:
            The response of the index-creation call.
        """
        text_kw = self._text_with_keyword
        vector = self._dense_vector
        settings = {
            "index": {
                "number_of_shards": 1,    # number of primary shards
                "number_of_replicas": 0,  # replicas per primary shard
            }
        }
        mappings = {
            "dynamic": True,
            "properties": {
                "book_id": {"type": "keyword"},
                "book_name": text_kw(),
                "book_id_page_id": {"type": "keyword"},
                "bankId": {"type": "integer"},             # question id
                # Integer fields accept arrays, so the id list needs no special type.
                "page_id2bank_ids": {"type": "integer"},
                "page_id": {"type": "integer"},            # electronic page number
                "question_text": text_kw(),
                "answer_image_url": {"type": "keyword"},
                "answer_text": text_kw(),
                "question_chapter": text_kw(),
                "book_page_id": {"type": "keyword"},       # printed page number
                "knowledge_point": text_kw(),
                "knowledge_point_code": {"type": "keyword"},
                "difficult": text_kw(),
                "resolve": text_kw(),                      # worked solution
                "source_image_url": {"type": "keyword"},
                "page_ocr": text_kw(),
                "question_image_url": {"type": "keyword"},
                "question_ocr": text_kw(),
                # 1024-dim vectors from "doubao-embedding-vision-250615".
                "question_image_embedding": vector(),
                "page_image_embedding": vector(),
            },
        }
        body = {'settings': settings, 'mappings': mappings}
        # Timeouts are raised above the defaults because index creation can be slow.
        res = self.es.indices.create(index=index_name, body=body, timeout='100s', master_timeout="1200s")
        print(f"create index {index_name} success.")
        return res

    def delete_index(self, index_name):
        """Delete the index ``index_name`` and return the client's response."""
        res = self.es.indices.delete(index=index_name)
        print(f"删除索引 {index_name} success.")
        return res

    def index_exists(self, index_name):
        """Return the client's answer to whether ``index_name`` exists."""
        return self.es.indices.exists(index=index_name)

    def read_dict(self, file):
        """Load a JSON list of records, dropping every falsy value from each record.

        Falsy values (empty strings, ``None``, empty lists, and also ``0``) are
        removed so that they are not sent to Elasticsearch.

        Args:
            file: Path of a UTF-8 JSON file containing a list of dicts.

        Returns:
            A new list of dicts containing only the truthy entries.
        """
        with open(file, 'r', encoding='utf-8') as f:
            data_list = json.load(f)
        res = [{k: v for k, v in item.items() if v} for item in data_list]
        print("len(data_list): ", len(data_list))
        return res

    def insert_doc(self, index_name, doc_body, doc_id=None):
        """Index a single document, optionally under an explicit ``doc_id``."""
        return self.es.index(index=index_name, body=doc_body, id=doc_id)

    def insert_doc_batch(self, index_name, doc_batch: list):
        """Bulk-index ``doc_batch``, using each record's ``bankId`` as document id."""
        actions = [
            {"_index": index_name, "_id": body['bankId'], "_source": body}
            for body in doc_batch
        ]
        helpers.bulk(self.es, actions)

    def insert_doc_file(self, index_name):
        """Bulk-index every prepared JSON file from the hard-coded data directory."""
        path = "/home/liuxin/work/search_question/data"
        files = [
            "12671977-暑假生活·学期总复习八年级物理通用版6-原图/12671977-暑假生活·学期总复习八年级物理通用版6.xlsx.json"
        ]
        for file in files:
            file = os.path.join(path, file)
            data_list = self.read_dict(file)
            self.insert_doc_batch(index_name, data_list)
            print(f"{file} 成功插入{index_name} es库.")

    def get_doc(self, index_name, doc_id):
        """Fetch the document ``doc_id`` from ``index_name``."""
        return self.es.get(index=index_name, id=doc_id)

    def update_doc(self, index_name, doc_id, doc_body):
        """Partially update document ``doc_id`` with the fields in ``doc_body``."""
        return self.es.update(index=index_name, id=doc_id, body={'doc': doc_body})

    def delete_doc(self, index_name, doc_id):
        """Delete the document ``doc_id`` from ``index_name``."""
        return self.es.delete(index=index_name, id=doc_id)

    def delete_book_id(self, index_name, book_id):
        """Delete every document whose ``book_id`` equals the given id."""
        book_id = str(book_id)

        def delete_actions():
            # Scan only ids (no _source) and turn every hit into a bulk delete action.
            for doc in helpers.scan(
                self.es,
                query={"query": {"term": {"book_id": book_id}}},
                index=index_name,
                _source=False,
                scroll="10m"
            ):
                yield {
                    "_op_type": "delete",
                    "_index": doc["_index"],
                    "_id": doc["_id"]
                }

        helpers.bulk(
            self.es,
            delete_actions(),
            chunk_size=5000,      # documents deleted per bulk request
            request_timeout=120   # extended request timeout
        )
        print(f"成功删除 book_id:{book_id} 的数据.")

    def search(self, index_name, query_body):
        """Run a raw search request body against ``index_name``."""
        return self.es.search(index=index_name, body=query_body)

    def delete_res_key(self, hits):
        """Remove the embedding fields from each hit's ``_source`` in place and return ``hits``."""
        for hit in hits:
            if '_source' not in hit:
                continue
            source = hit['_source']
            for field in self._EMBEDDING_FIELDS:
                source.pop(field, None)
        return hits

    def _run_query(self, index_name, query, top_k):
        """Execute ``query`` and return at most ``top_k`` hits without embedding fields."""
        response = self.es.search(
            index=index_name,
            body={"query": query},
            size=top_k
        )
        return self.delete_res_key(response["hits"]["hits"])

    async def search_page(self, index_name, book_id, page_ocr, top_k=3):
        """Find pages of one book by full-text matching the OCR text of a whole page.

        Args:
            index_name: Index to search.
            book_id: Exact book id to restrict the search to.
            page_ocr: OCR text of the photographed page.
            top_k: Maximum number of hits to return.

        Returns:
            The list of hits, with embedding fields removed.
        """
        query = {
            "bool": {
                "must": [
                    {"term": {"book_id": book_id}},      # exact book match
                    {"match": {"page_ocr": page_ocr}},   # full-text match on the page OCR
                ]
            }
        }
        return self._run_query(index_name, query, top_k)

    async def search_question(self, index_name, book_id, question_ocr, top_k=3, page_ids=None):
        """Find questions of one book by full-text matching the question OCR text.

        Args:
            index_name: Index to search.
            book_id: Exact book id to restrict the search to.
            question_ocr: OCR text of the photographed question.
            top_k: Maximum number of hits to return.
            page_ids: Optional list of electronic page numbers to restrict to.

        Returns:
            The list of hits, with embedding fields removed.
        """
        must = [{"term": {"book_id": book_id}}]
        if page_ids:
            must.append({"terms": {"page_id": page_ids}})
        must.append({"match": {"question_ocr": question_ocr}})
        return self._run_query(index_name, {"bool": {"must": must}}, top_k)

    async def search_page_question(self, index_name, book_id, page_ocr, question_ocr, top_k=2):
        """Locate the page first, then search the question on that page or the previous one.

        A question printed across a page break is only recorded on the page
        where it starts, so the match may belong to the located page or to the
        one before it.

        Returns:
            The matching question hits, or an empty list when no page is found.
        """
        page_res = await self.search_page(index_name, book_id, page_ocr, top_k=1)
        if not (page_res and '_source' in page_res[0] and 'page_id' in page_res[0]['_source']):
            return []
        page_id = int(page_res[0]['_source']['page_id'])
        return await self.search_question(
            index_name, book_id, question_ocr, top_k=top_k, page_ids=[page_id, page_id - 1]
        )

    async def search_question_embedding(self, index_name, book_id, question_image_embedding, top_k=2):
        """Find questions of one book by cosine similarity of the question embedding only.

        Args:
            index_name: Index to search.
            book_id: Exact book id used as a filter.
            question_image_embedding: Query vector built from the question image and its OCR text.
            top_k: Maximum number of hits to return.

        Returns:
            The list of hits, with embedding fields removed.
        """
        query = {
            "bool": {
                "filter": [{"term": {"book_id": book_id}}],
                "must": [
                    {
                        "script_score": {
                            "query": {"match_all": {}},
                            "script": {
                                # +1.0 keeps the score non-negative.
                                "source": "cosineSimilarity(params.query_vector, 'question_image_embedding') + 1.0",
                                "params": {
                                    "query_vector": question_image_embedding,
                                }
                            }
                        }
                    }
                ]
            }
        }
        return self._run_query(index_name, query, top_k)

    async def search_question_textAndEmbedding(self, index_name, book_id, question_ocr, question_image_embedding, top_k=2):
        """Find questions by a weighted blend of text relevance and embedding similarity.

        The text score is clamped to [0.1, 2.0] (0.1 also serves as the base
        score when the text does not match), the vector score is cosine
        similarity + 1.0, both are squashed with a sigmoid and then combined
        as 0.2 * text + 0.8 * image.

        Args:
            index_name: Index to search.
            book_id: Exact book id used as a filter.
            question_ocr: OCR text of the photographed question.
            question_image_embedding: Query vector built from the question image and its OCR text.
            top_k: Maximum number of hits to return.

        Returns:
            The list of hits, with embedding fields removed.
        """
        query = {
            "script_score": {
                "query": {
                    "bool": {
                        "filter": [
                            {"term": {"book_id": book_id}}
                        ],
                        "should": [
                            {"match": {"question_ocr": question_ocr}}
                        ],
                        # Text match is optional: documents without one are still scored.
                        "minimum_should_match": 0
                    }
                },
                "script": {
                    "source": """
                        double rawScore = _score;
                        double textScore = rawScore <= 0.1 ? 0.1 : (rawScore > 2.0 ? 2.0 : rawScore);
                        double imageScore = cosineSimilarity(params.query_vector, 'question_image_embedding')+ 1.0;
                        double normalizedTextScore = 1.0 / (1.0 + Math.exp(-params.text_scale * (textScore - params.text_bias)));
                        double normalizedImageScore = 1.0 / (1.0 + Math.exp(-params.image_scale * (imageScore - params.image_bias)));
                        return params.text_weight * normalizedTextScore + params.image_weight * normalizedImageScore;
                    """,
                    "params": {
                        "query_vector": question_image_embedding,
                        "text_weight": 0.2,
                        "image_weight": 0.8,
                        "text_scale": 2.0,
                        "image_scale": 2.0,
                        "text_bias": 0.5,
                        "image_bias": 0.5
                    }
                }
            }
        }
        return self._run_query(index_name, query, top_k)
if __name__ == "__main__":
index_name = "search_question" # rays 搜题
es = EsHelper()
## 0、查看es中有多少条数据
# count = es.es.count(index=index_name)
# print(f'{index_name} :{count} ') # {'count': 307, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}}
# # # 1、创建新的es索引
# # if es.index_exists( index_name):
# es.delete_index(index_name) # 删除索引
# create_index_res = es.create_index(index_name) # 建立一个新的es 索引
# print('create_index_res: ', create_index_res)
# # 2、es中批量插入数据
# es.insert_doc_file(index_name)
# ## 3、测试查询es库中的数据
# id = '1190766'
# res = es.get_doc(index_name, id)
# print(res)
# ## 4、测试查询es库中的整页数据
# print("\n4、测试查询es库中的整页数据")
# book_id = "12671977" #
# input_ocr = "某篮球运动员在比赛中由于飞身扣篮用力过猛,将篮板玻璃打碎了,他的手腕也同时受伤. 说明了"
# res = es.search_page( index_name, book_id, input_ocr, top_k=3)
# res = asyncio.run(res)
# res = json.dumps(res, ensure_ascii=False, indent=4)
# print(res)
# ## 5、直接查询es库中题目
# print("\n5、查询es库中题目")
# book_id = "12671977"
# question_ocr = "某篮球运动员在比赛中由于飞身扣篮用力过猛,将篮板玻璃打碎了,他的手腕也同时受伤. 说明了"
# res = es.search_question(index_name, book_id, question_ocr, top_k=3)
# res = asyncio.run(res)
# res = json.dumps(res, ensure_ascii=False, indent=4)
# print(res)
# ## 6、先查询es库中page_id,再查询本页或前一页的题目
# print("\n6、先查询es库中page_id,再查询本页或前一页的题目")
# book_id = "12671977"
# page_ocr = "某篮球运动员在比赛中由于飞身扣篮用力过猛,将篮板玻璃打碎了,他的手腕也同时受伤. 说明了"
# question_ocr = "某篮球运动员在比赛中由于飞身扣篮用力过猛,将篮板玻璃打碎了,他的手腕也同时受伤. 说明了"
# res = es.search_page_question( index_name, book_id, page_ocr, question_ocr, top_k=2)
# res = asyncio.run(res)
# res = json.dumps(res, ensure_ascii=False, indent=4)
# print(res)
# 7 依据 book_id 批量删除 某本书的的所有数据
# book_ids =[ '12671977','12670279','12667382', '12664728' ]
# for book_id in book_ids:
# es.delete_book_id(index_name, book_id)
# 8、使用题目向量 搜索 库中题目
book_id = "12667382"
question_image_embedding = [
0.00543212890625,
-0.005035400390625,
0.00714111328125,
0.06591796875,
-0.032958984375,
-0.0380859375,
-0.0255126953125,
-0.046630859375,
0.01031494140625,
0.036376953125,
0.013916015625,
0.0255126953125,
-0.0107421875,
-0.04345703125,
0.0008697509765625,
0.02587890625,
-0.0201416015625,
-0.02099609375,
-0.00830078125,
0.033203125,
-0.01007080078125,
0.0224609375,
-0.04931640625,
0.0206298828125,
0.009765625,
0.02392578125,
0.00095367431640625,
0.03369140625,
0.005035400390625,
0.07373046875,
0.0262451171875,
0.015625,
-0.021728515625,
0.0054931640625,
0.0308837890625,
0.055419921875,
-0.0517578125,
0.011474609375,
-0.023681640625,
0.0235595703125,
-0.00128173828125,
0.0096435546875,
0.003936767578125,
-0.01953125,
-0.05126953125,
0.001617431640625,
-0.005218505859375,
-0.01409912109375,
-0.026611328125,
0.0198974609375,
0.02001953125,
0.004974365234375,
0.053955078125,
-0.0634765625,
-0.0654296875,
-0.03369140625,
-0.08154296875,
0.06005859375,
0.0223388671875,
-0.044921875,
-0.0283203125,
0.01483154296875,
0.150390625,
0.026611328125,
0.02392578125,
-0.00125885009765625,
-0.053955078125,
-0.03759765625,
-0.0390625,
0.02734375,
-0.01385498046875,
0.00140380859375,
0.0218505859375,
-0.01025390625,
-0.0205078125,
0.04638671875,
0.0145263671875,
-0.00927734375,
0.01025390625,
-0.0225830078125,
-0.00909423828125,
-0.0142822265625,
0.02978515625,
0.0101318359375,
-0.072265625,
-0.049560546875,
0.0311279296875,
0.0101318359375,
-0.00445556640625,
-0.03369140625,
0.0615234375,
-0.021240234375,
-0.034912109375,
-0.004150390625,
0.07177734375,
0.0262451171875,
-0.013671875,
0.03369140625,
0.021728515625,
0.0252685546875,
-0.00982666015625,
-0.0028228759765625,
-0.00151824951171875,
0.003753662109375,
0.004791259765625,
-0.03759765625,
0.025146484375,
0.0235595703125,
-0.0103759765625,
0.0084228515625,
-0.0191650390625,
0.044921875,
0.052001953125,
-0.053955078125,
0.0206298828125,
0.036376953125,
0.01202392578125,
0.0196533203125,
0.025146484375,
-0.000457763671875,
-0.0033721923828125,
-0.0101318359375,
0.005035400390625,
0.0089111328125,
0.002227783203125,
-0.041015625,
0.00927734375,
0.10693359375,
0.01043701171875,
0.00616455078125,
0.013916015625,
0.0035858154296875,
-0.0283203125,
-0.06591796875,
0.016845703125,
-0.0615234375,
-0.01385498046875,
-0.05029296875,
-0.01556396484375,
-0.0128173828125,
-0.0140380859375,
0.00732421875,
-0.0164794921875,
0.01239013671875,
0.0150146484375,
-0.0169677734375,
-0.142578125,
-0.0008087158203125,
-0.007171630859375,
0.0033416748046875,
0.03662109375,
-0.00341796875,
-0.0198974609375,
-0.03955078125,
-0.06640625,
-0.05712890625,
0.0133056640625,
-0.0277099609375,
-0.005462646484375,
-0.0224609375,
0.002166748046875,
0.0296630859375,
-0.030517578125,
-0.029052734375,
0.030517578125,
-0.0162353515625,
-0.0252685546875,
-0.0174560546875,
-0.023193359375,
0.005340576171875,
-0.026611328125,
-0.006011962890625,
-0.01611328125,
-0.033203125,
-0.000820159912109375,
0.04248046875,
-0.0301513671875,
0.019287109375,
0.00909423828125,
0.0400390625,
0.03564453125,
0.030517578125,
0.0140380859375,
-0.0014495849609375,
0.01251220703125,
-0.0556640625,
0.04248046875,
-0.056640625,
-0.030517578125,
-0.009033203125,
-0.01165771484375,
-0.0277099609375,
0.0087890625,
0.024658203125,
-0.043701171875,
0.0146484375,
-0.020751953125,
0.01007080078125,
0.0164794921875,
0.06591796875,
-0.06298828125,
0.0303955078125,
-0.020263671875,
-0.007171630859375,
0.004425048828125,
0.0020751953125,
-0.01373291015625,
0.050537109375,
-0.08203125,
-0.036865234375,
0.019775390625,
-0.0050048828125,
0.06640625,
-0.00823974609375,
0.04248046875,
-0.02734375,
0.004425048828125,
0.006500244140625,
-0.00537109375,
-0.0546875,
0.01470947265625,
0.0089111328125,
0.0025787353515625,
-0.00860595703125,
0.018310546875,
-0.0208740234375,
-0.06689453125,
-0.00592041015625,
-0.0303955078125,
-0.0021514892578125,
0.00372314453125,
0.0218505859375,
-0.0301513671875,
0.02587890625,
-0.01239013671875,
0.04541015625,
0.03662109375,
0.07421875,
0.0225830078125,
0.000385284423828125,
0.06396484375,
-0.0020294189453125,
0.008056640625,
0.0111083984375,
0.0081787109375,
0.033447265625,
-0.01544189453125,
0.09912109375,
-0.00994873046875,
0.0208740234375,
-0.044921875,
-0.017333984375,
0.0211181640625,
0.08056640625,
-0.02294921875,
0.062255859375,
0.01708984375,
-0.011474609375,
0.03125,
0.018798828125,
0.0019683837890625,
0.0390625,
-0.0052490234375,
-0.01312255859375,
-0.002044677734375,
-0.046630859375,
0.015625,
0.037353515625,
0.002777099609375,
-0.00811767578125,
0.0233154296875,
-0.007232666015625,
0.039794921875,
0.009765625,
0.01708984375,
0.043212890625,
0.0101318359375,
0.02783203125,
-0.021484375,
-0.01470947265625,
-0.013671875,
0.01470947265625,
-0.032470703125,
-0.0185546875,
-0.00506591796875,
0.03125,
-0.03125,
0.048095703125,
-0.01165771484375,
0.0296630859375,
0.017822265625,
-0.011474609375,
0.00787353515625,
0.005645751953125,
-0.0093994140625,
0.0380859375,
-0.017333984375,
0.0255126953125,
-0.01806640625,
0.033447265625,
-0.001739501953125,
-0.0191650390625,
0.0054931640625,
0.053466796875,
-0.0308837890625,
-0.006072998046875,
0.0179443359375,
0.025634765625,
-0.01318359375,
0.01214599609375,
-0.0040283203125,
0.03125,
0.047119140625,
0.056640625,
0.01116943359375,
-0.0235595703125,
0.00872802734375,
0.003936767578125,
-0.000652313232421875,
-0.004638671875,
-0.00341796875,
0.0035400390625,
0.0322265625,
0.0169677734375,
0.008056640625,
-0.0262451171875,
-0.080078125,
-0.04638671875,
0.017822265625,
-0.01544189453125,
-0.0296630859375,
0.033203125,
0.10693359375,
0.038330078125,
0.00286865234375,
0.04541015625,
0.021484375,
-0.04931640625,
0.01953125,
-0.035400390625,
-0.07958984375,
-0.07177734375,
-0.004119873046875,
0.0208740234375,
0.10498046875,
-0.006011962890625,
-0.04248046875,
-0.045654296875,
-0.001434326171875,
0.00994873046875,
0.0164794921875,
0.0277099609375,
0.006378173828125,
-0.0286865234375,
0.0191650390625,
-0.02587890625,
0.011474609375,
0.0703125,
0.00732421875,
-0.032470703125,
-0.0211181640625,
-0.05615234375,
0.01092529296875,
0.031494140625,
0.0247802734375,
-0.00372314453125,
-0.0013427734375,
-0.06982421875,
-0.00714111328125,
0.0196533203125,
0.017333984375,
0.01336669921875,
-0.00188446044921875,
-0.01019287109375,
-0.00836181640625,
0.035400390625,
-0.0400390625,
0.01177978515625,
0.009033203125,
0.004425048828125,
0.0157470703125,
0.0015716552734375,
0.007354736328125,
-0.037841796875,
0.01214599609375,
-0.002685546875,
-0.0233154296875,
0.0107421875,
-0.052001953125,
-0.057373046875,
0.007354736328125,
0.007171630859375,
-0.02734375,
-0.005523681640625,
-0.0010528564453125,
-0.0106201171875,
0.0022125244140625,
0.08642578125,
-0.033447265625,
0.03125,
-0.0225830078125,
-0.03125,
0.007049560546875,
-0.0712890625,
-0.0023956298828125,
0.006317138671875,
-0.056640625,
0.0206298828125,
-0.017333984375,
0.048828125,
0.0419921875,
-0.0037078857421875,
0.02978515625,
0.0159912109375,
0.04443359375,
-0.060302734375,
-0.038330078125,
0.0277099609375,
0.036865234375,
0.0322265625,
-0.006011962890625,
-0.022705078125,
-0.050537109375,
0.041748046875,
0.01507568359375,
0.007232666015625,
0.050048828125,
0.041015625,
-0.0240478515625,
0.006561279296875,
0.000301361083984375,
-0.01446533203125,
0.01507568359375,
-0.0556640625,
-0.02734375,
-0.0081787109375,
0.04736328125,
0.0025787353515625,
-0.0283203125,
0.01129150390625,
-0.0252685546875,
0.0225830078125,
0.00421142578125,
-0.004791259765625,
0.005401611328125,
0.006011962890625,
-0.057861328125,
-0.0289306640625,
0.008544921875,
-0.0115966796875,
0.022705078125,
-0.008056640625,
-0.01416015625,
0.0021209716796875,
-0.0038909912109375,
0.01025390625,
-0.0247802734375,
0.01275634765625,
0.031494140625,
-0.00015926361083984375,
-0.004119873046875,
0.041259765625,
-0.040283203125,
0.052734375,
-0.052734375,
-0.0225830078125,
-0.06884765625,
-0.07421875,
-0.0341796875,
0.0196533203125,
0.0038604736328125,
-0.041015625,
-0.005615234375,
-0.039794921875,
-0.0205078125,
0.04931640625,
-0.0289306640625,
0.006561279296875,
0.020751953125,
-0.03369140625,
0.020263671875,
-0.039794921875,
0.0081787109375,
-0.0020294189453125,
0.0218505859375,
0.01611328125,
-0.0281982421875,
-0.027099609375,
0.00131988525390625,
-0.0654296875,
0.023681640625,
-0.006256103515625,
-0.0211181640625,
-0.006103515625,
-0.034912109375,
-0.00299072265625,
-0.0059814453125,
-0.036376953125,
-0.0174560546875,
-0.056640625,
0.000316619873046875,
-0.033935546875,
0.00726318359375,
-0.033447265625,
0.06591796875,
0.0018768310546875,
0.02734375,
-0.04345703125,
0.032958984375,
-0.01953125,
-0.0034332275390625,
0.01300048828125,
0.000713348388671875,
-0.008056640625,
-0.004730224609375,
-0.0341796875,
0.030517578125,
-0.0255126953125,
0.0286865234375,
-0.04638671875,
0.025146484375,
-0.03515625,
0.001220703125,
-0.01068115234375,
0.043212890625,
-0.01416015625,
-0.0223388671875,
-0.03369140625,
-0.0059814453125,
0.007049560546875,
0.0220947265625,
-0.0281982421875,
-0.00811767578125,
-0.00799560546875,
0.0263671875,
0.01373291015625,
0.00194549560546875,
0.20703125,
-0.0076904296875,
0.06591796875,
-0.0059814453125,
0.01043701171875,
-0.01251220703125,
0.038330078125,
-0.0087890625,
0.09765625,
0.003082275390625,
0.0096435546875,
0.000152587890625,
0.02392578125,
-0.04638671875,
-0.007781982421875,
-0.0030975341796875,
-0.00098419189453125,
-0.0181884765625,
-0.007232666015625,
-0.0235595703125,
0.033447265625,
0.0125732421875,
-0.00171661376953125,
0.022216796875,
-0.000278472900390625,
-0.02392578125,
0.0294189453125,
-0.0128173828125,
0.039794921875,
-0.0247802734375,
-0.049560546875,
0.003631591796875,
0.0206298828125,
0.0164794921875,
0.028564453125,
-0.01251220703125,
0.02294921875,
-0.00099945068359375,
-0.022216796875,
-0.050537109375,
0.001861572265625,
-0.0191650390625,
-0.045654296875,
-0.023681640625,
0.004119873046875,
0.050048828125,
-0.007232666015625,
0.05029296875,
-0.027099609375,
-0.0169677734375,
0.035888671875,
-0.02001953125,
-0.0040283203125,
0.01043701171875,
0.0035400390625,
-0.0059814453125,
0.0263671875,
-0.01220703125,
-0.016357421875,
0.01483154296875,
-0.0390625,
-0.00396728515625,
0.04345703125,
0.023193359375,
-0.0172119140625,
0.021240234375,
-0.04736328125,
0.006866455078125,
0.036376953125,
-0.0289306640625,
0.0208740234375,
0.011474609375,
-0.0260009765625,
0.01904296875,
0.016357421875,
-0.04931640625,
-0.0478515625,
0.0213623046875,
-0.01275634765625,
-0.032470703125,
-0.0084228515625,
-0.001922607421875,
0.0191650390625,
0.008544921875,
-0.037353515625,
0.0111083984375,
-0.022705078125,
-0.04052734375,
0.033447265625,
-0.01116943359375,
-0.03369140625,
-0.0167236328125,
-0.0303955078125,
0.0191650390625,
-0.06591796875,
0.00433349609375,
-0.078125,
-0.00732421875,
0.04150390625,
0.00286865234375,
0.0157470703125,
-0.00008440017700195312,
0.041015625,
-0.034423828125,
0.053466796875,
0.005218505859375,
0.01239013671875,
-0.006072998046875,
-0.00628662109375,
0.018798828125,
0.00151824951171875,
0.01165771484375,
0.0164794921875,
0.0089111328125,
0.0247802734375,
0.024658203125,
-0.0031585693359375,
-0.0181884765625,
-0.0103759765625,
-0.04541015625,
0.00323486328125,
0.0157470703125,
-0.004791259765625,
0.0308837890625,
-0.03076171875,
0.00506591796875,
0.02392578125,
-0.0179443359375,
0.0269775390625,
-0.003509521484375,
-0.0181884765625,
0.020263671875,
0.03076171875,
0.00145721435546875,
-0.004547119140625,
0.0167236328125,
0.035400390625,
-0.0206298828125,
-0.0810546875,
-0.0113525390625,
-0.02294921875,
0.0022735595703125,
0.0250244140625,
-0.006561279296875,
0.0223388671875,
0.043212890625,
0.03564453125,
-0.009765625,
0.036376953125,
-0.0028228759765625,
-0.003204345703125,
-0.041748046875,
-0.001495361328125,
-0.05419921875,
-0.04248046875,
-0.01708984375,
-0.002777099609375,
-0.01251220703125,
-0.001739501953125,
-0.0084228515625,
0.00130462646484375,
-0.000579833984375,
-0.0262451171875,
0.01446533203125,
0.037841796875,
0.02001953125,
-0.0157470703125,
-0.017333984375,
0.006317138671875,
0.0125732421875,
-0.0206298828125,
0.02197265625,
0.012939453125,
-0.0162353515625,
-0.068359375,
0.0250244140625,
0.006317138671875,
0.00201416015625,
-0.005126953125,
-0.00872802734375,
-0.06591796875,
0.0233154296875,
-0.01806640625,
-0.01312255859375,
-0.042236328125,
-0.03125,
-0.0198974609375,
-0.00193023681640625,
-0.004364013671875,
-0.0262451171875,
-0.0439453125,
-0.020263671875,
-0.01043701171875,
-0.0250244140625,
0.036865234375,
0.00946044921875,
-0.012939453125,
0.0213623046875,
-0.032470703125,
-0.048095703125,
0.020263671875,
0.00482177734375,
0.048583984375,
0.0301513671875,
-0.046630859375,
-0.021728515625,
-0.00408935546875,
0.0164794921875,
-0.006591796875,
-0.01409912109375,
0.0439453125,
-0.003936767578125,
-0.053955078125,
-0.0123291015625,
-0.025146484375,
-0.0185546875,
-0.061767578125,
0.0045166015625,
-0.01348876953125,
-0.025634765625,
0.0013580322265625,
-0.033447265625,
-0.0093994140625,
-0.039794921875,
0.04638671875,
-0.00799560546875,
-0.03955078125,
-0.015869140625,
0.0322265625,
0.0093994140625,
0.02978515625,
0.005828857421875,
-0.00011873245239257812,
-0.003936767578125,
0.0274658203125,
0.00701904296875,
0.000858306884765625,
-0.0146484375,
0.020263671875,
0.04150390625,
0.0216064453125,
-0.0164794921875,
0.043212890625,
0.0546875,
0.050048828125,
0.01470947265625,
-0.006561279296875,
0.00958251953125,
-0.0164794921875,
-0.01177978515625,
-0.03564453125,
-0.009765625,
0.007598876953125,
-0.00133514404296875,
-0.0322265625,
0.0211181640625,
0.029296875,
0.009765625,
-0.00151824951171875,
-0.0019073486328125,
-0.00830078125,
-0.032958984375,
0.028564453125,
-0.0186767578125,
-0.03369140625,
0.00897216796875,
-0.0040283203125,
-0.01104736328125,
0.01434326171875,
0.05419921875,
0.012939453125,
-0.0341796875,
-0.002471923828125,
-0.01446533203125,
0.04150390625,
0.04541015625,
0.0203857421875,
0.01251220703125,
-0.01348876953125,
0.01312255859375,
0.0052490234375,
0.0419921875,
-0.01092529296875,
-0.022705078125,
-0.0252685546875,
0.00836181640625,
-0.03759765625,
-0.03076171875,
0.009033203125,
-0.017822265625,
-0.00982666015625,
0.04736328125,
0.025634765625,
0.01165771484375,
-0.032470703125,
-0.0267333984375,
0.03515625,
0.0012664794921875,
-0.061767578125,
-0.0089111328125,
0.050048828125,
0.01275634765625,
0.0250244140625,
0.0191650390625,
0.036376953125,
0.005828857421875,
-0.0047607421875,
0.0023956298828125,
-0.031494140625,
-0.003143310546875,
-0.004669189453125,
0.0194091796875,
0.00897216796875,
-0.01104736328125,
0.0849609375,
0.007232666015625,
-0.0201416015625,
-0.0260009765625,
-0.0172119140625,
0.08984375,
0.0517578125,
-0.0260009765625,
-0.0260009765625,
-0.01239013671875,
-0.0247802734375,
0.0164794921875,
-0.0225830078125,
0.018798828125,
-0.038330078125,
-0.0308837890625,
-0.024658203125,
-0.002227783203125,
0.003204345703125,
0.0185546875,
-0.04443359375,
-0.004302978515625,
0.03271484375,
-0.01507568359375,
-0.00080108642578125,
0.038818359375,
0.0040283203125,
0.007049560546875,
-0.0198974609375,
0.004364013671875,
-0.02392578125,
-0.029296875,
-0.0037994384765625,
-0.033447265625,
0.01348876953125,
0.058837890625,
0.00811767578125,
0.0341796875,
0.0064697265625,
0.025146484375,
-0.000492095947265625,
0.0289306640625,
-0.0074462890625,
0.0189208984375,
0.031494140625,
-0.00518798828125,
-0.0283203125,
-0.0011444091796875,
0.03857421875,
0.036376953125,
0.026611328125,
0.005615234375,
-0.0032958984375,
-0.025146484375,
-0.02197265625,
0.0194091796875,
0.0181884765625,
0.07275390625,
-0.0045166015625,
0.025634765625,
-0.0286865234375,
0.0103759765625,
-0.01141357421875,
-0.03564453125,
0.049560546875,
-0.03173828125,
-0.0269775390625,
0.02734375,
0.041015625,
-0.01953125,
-0.0003528594970703125,
-0.0133056640625,
0.0203857421875,
-0.011474609375,
0.0150146484375,
0.0001010894775390625,
-0.01165771484375,
-0.026611328125,
0.03564453125,
-0.012939453125,
0.00024127960205078125,
-0.01446533203125,
0.0322265625,
-0.03125,
-0.0093994140625,
0.00066375732421875,
-0.02197265625,
0.0115966796875,
-0.033935546875,
-0.045166015625,
0.0361328125,
0.017822265625,
0.0322265625,
-0.0263671875,
0.04345703125,
-0.0133056640625,
-0.0478515625,
0.0087890625,
0.0208740234375,
-0.014892578125,
0.0135498046875,
0.044921875,
-0.006011962890625,
-0.0042724609375,
-0.0242919921875,
0.025146484375,
-0.021728515625,
-0.033935546875,
-0.0198974609375,
0.0172119140625,
0.00958251953125,
0.01806640625,
0.04345703125,
-0.01104736328125,
0.036376953125,
0.00141143798828125,
-0.03466796875,
-0.0001964569091796875,
-0.035400390625,
0.0191650390625,
0.0031280517578125,
0.00848388671875,
0.03076171875,
-0.0277099609375,
0.01953125,
0.08447265625,
0.0169677734375,
-0.02783203125,
-0.045166015625,
0.0181884765625,
0.039794921875,
-0.023681640625,
-0.00701904296875,
0.040771484375,
-0.020751953125,
0.06689453125,
-0.048828125,
-0.00164031982421875,
0.006256103515625,
0.01007080078125,
0.00323486328125,
-0.03662109375,
0.0419921875,
0.0002651214599609375,
-0.0301513671875,
-0.0111083984375,
0.072265625,
0.10009765625,
0.03564453125,
-0.0032501220703125,
0.06103515625,
0.0103759765625,
0.011474609375,
-0.034912109375,
-0.0115966796875,
0.025634765625,
-0.062255859375,
-0.01416015625,
0.037841796875,
-0.006683349609375,
-0.04736328125,
0.04345703125,
0.0021514892578125,
-0.0311279296875,
0.00885009765625,
-0.0107421875,
0.00372314453125,
-0.021728515625,
-0.00020503997802734375,
0.000263214111328125,
-0.01446533203125,
0.0028228759765625,
0.0277099609375
]
# res = es.search_question_embedding( index_name, book_id, question_image_embedding, top_k=2)
# res = asyncio.run(res)
# res = json.dumps(res, ensure_ascii=False, indent=4)
# print(res)
# 9. Hybrid search: combine vector search with text search and merge the two score rankings
book_id = "12667382"
question_image_embedding = [
0.00543212890625,
-0.005035400390625,
0.00714111328125,
0.06591796875,
-0.032958984375,
-0.0380859375,
-0.0255126953125,
-0.046630859375,
0.01031494140625,
0.036376953125,
0.013916015625,
0.0255126953125,
-0.0107421875,
-0.04345703125,
0.0008697509765625,
0.02587890625,
-0.0201416015625,
-0.02099609375,
-0.00830078125,
0.033203125,
-0.01007080078125,
0.0224609375,
-0.04931640625,
0.0206298828125,
0.009765625,
0.02392578125,
0.00095367431640625,
0.03369140625,
0.005035400390625,
0.07373046875,
0.0262451171875,
0.015625,
-0.021728515625,
0.0054931640625,
0.0308837890625,
0.055419921875,
-0.0517578125,
0.011474609375,
-0.023681640625,
0.0235595703125,
-0.00128173828125,
0.0096435546875,
0.003936767578125,
-0.01953125,
-0.05126953125,
0.001617431640625,
-0.005218505859375,
-0.01409912109375,
-0.026611328125,
0.0198974609375,
0.02001953125,
0.004974365234375,
0.053955078125,
-0.0634765625,
-0.0654296875,
-0.03369140625,
-0.08154296875,
0.06005859375,
0.0223388671875,
-0.044921875,
-0.0283203125,
0.01483154296875,
0.150390625,
0.026611328125,
0.02392578125,
-0.00125885009765625,
-0.053955078125,
-0.03759765625,
-0.0390625,
0.02734375,
-0.01385498046875,
0.00140380859375,
0.0218505859375,
-0.01025390625,
-0.0205078125,
0.04638671875,
0.0145263671875,
-0.00927734375,
0.01025390625,
-0.0225830078125,
-0.00909423828125,
-0.0142822265625,
0.02978515625,
0.0101318359375,
-0.072265625,
-0.049560546875,
0.0311279296875,
0.0101318359375,
-0.00445556640625,
-0.03369140625,
0.0615234375,
-0.021240234375,
-0.034912109375,
-0.004150390625,
0.07177734375,
0.0262451171875,
-0.013671875,
0.03369140625,
0.021728515625,
0.0252685546875,
-0.00982666015625,
-0.0028228759765625,
-0.00151824951171875,
0.003753662109375,
0.004791259765625,
-0.03759765625,
0.025146484375,
0.0235595703125,
-0.0103759765625,
0.0084228515625,
-0.0191650390625,
0.044921875,
0.052001953125,
-0.053955078125,
0.0206298828125,
0.036376953125,
0.01202392578125,
0.0196533203125,
0.025146484375,
-0.000457763671875,
-0.0033721923828125,
-0.0101318359375,
0.005035400390625,
0.0089111328125,
0.002227783203125,
-0.041015625,
0.00927734375,
0.10693359375,
0.01043701171875,
0.00616455078125,
0.013916015625,
0.0035858154296875,
-0.0283203125,
-0.06591796875,
0.016845703125,
-0.0615234375,
-0.01385498046875,
-0.05029296875,
-0.01556396484375,
-0.0128173828125,
-0.0140380859375,
0.00732421875,
-0.0164794921875,
0.01239013671875,
0.0150146484375,
-0.0169677734375,
-0.142578125,
-0.0008087158203125,
-0.007171630859375,
0.0033416748046875,
0.03662109375,
-0.00341796875,
-0.0198974609375,
-0.03955078125,
-0.06640625,
-0.05712890625,
0.0133056640625,
-0.0277099609375,
-0.005462646484375,
-0.0224609375,
0.002166748046875,
0.0296630859375,
-0.030517578125,
-0.029052734375,
0.030517578125,
-0.0162353515625,
-0.0252685546875,
-0.0174560546875,
-0.023193359375,
0.005340576171875,
-0.026611328125,
-0.006011962890625,
-0.01611328125,
-0.033203125,
-0.000820159912109375,
0.04248046875,
-0.0301513671875,
0.019287109375,
0.00909423828125,
0.0400390625,
0.03564453125,
0.030517578125,
0.0140380859375,
-0.0014495849609375,
0.01251220703125,
-0.0556640625,
0.04248046875,
-0.056640625,
-0.030517578125,
-0.009033203125,
-0.01165771484375,
-0.0277099609375,
0.0087890625,
0.024658203125,
-0.043701171875,
0.0146484375,
-0.020751953125,
0.01007080078125,
0.0164794921875,
0.06591796875,
-0.06298828125,
0.0303955078125,
-0.020263671875,
-0.007171630859375,
0.004425048828125,
0.0020751953125,
-0.01373291015625,
0.050537109375,
-0.08203125,
-0.036865234375,
0.019775390625,
-0.0050048828125,
0.06640625,
-0.00823974609375,
0.04248046875,
-0.02734375,
0.004425048828125,
0.006500244140625,
-0.00537109375,
-0.0546875,
0.01470947265625,
0.0089111328125,
0.0025787353515625,
-0.00860595703125,
0.018310546875,
-0.0208740234375,
-0.06689453125,
-0.00592041015625,
-0.0303955078125,
-0.0021514892578125,
0.00372314453125,
0.0218505859375,
-0.0301513671875,
0.02587890625,
-0.01239013671875,
0.04541015625,
0.03662109375,
0.07421875,
0.0225830078125,
0.000385284423828125,
0.06396484375,
-0.0020294189453125,
0.008056640625,
0.0111083984375,
0.0081787109375,
0.033447265625,
-0.01544189453125,
0.09912109375,
-0.00994873046875,
0.0208740234375,
-0.044921875,
-0.017333984375,
0.0211181640625,
0.08056640625,
-0.02294921875,
0.062255859375,
0.01708984375,
-0.011474609375,
0.03125,
0.018798828125,
0.0019683837890625,
0.0390625,
-0.0052490234375,
-0.01312255859375,
-0.002044677734375,
-0.046630859375,
0.015625,
0.037353515625,
0.002777099609375,
-0.00811767578125,
0.0233154296875,
-0.007232666015625,
0.039794921875,
0.009765625,
0.01708984375,
0.043212890625,
0.0101318359375,
0.02783203125,
-0.021484375,
-0.01470947265625,
-0.013671875,
0.01470947265625,
-0.032470703125,
-0.0185546875,
-0.00506591796875,
0.03125,
-0.03125,
0.048095703125,
-0.01165771484375,
0.0296630859375,
0.017822265625,
-0.011474609375,
0.00787353515625,
0.005645751953125,
-0.0093994140625,
0.0380859375,
-0.017333984375,
0.0255126953125,
-0.01806640625,
0.033447265625,
-0.001739501953125,
-0.0191650390625,
0.0054931640625,
0.053466796875,
-0.0308837890625,
-0.006072998046875,
0.0179443359375,
0.025634765625,
-0.01318359375,
0.01214599609375,
-0.0040283203125,
0.03125,
0.047119140625,
0.056640625,
0.01116943359375,
-0.0235595703125,
0.00872802734375,
0.003936767578125,
-0.000652313232421875,
-0.004638671875,
-0.00341796875,
0.0035400390625,
0.0322265625,
0.0169677734375,
0.008056640625,
-0.0262451171875,
-0.080078125,
-0.04638671875,
0.017822265625,
-0.01544189453125,
-0.0296630859375,
0.033203125,
0.10693359375,
0.038330078125,
0.00286865234375,
0.04541015625,
0.021484375,
-0.04931640625,
0.01953125,
-0.035400390625,
-0.07958984375,
-0.07177734375,
-0.004119873046875,
0.0208740234375,
0.10498046875,
-0.006011962890625,
-0.04248046875,
-0.045654296875,
-0.001434326171875,
0.00994873046875,
0.0164794921875,
0.0277099609375,
0.006378173828125,
-0.0286865234375,
0.0191650390625,
-0.02587890625,
0.011474609375,
0.0703125,
0.00732421875,
-0.032470703125,
-0.0211181640625,
-0.05615234375,
0.01092529296875,
0.031494140625,
0.0247802734375,
-0.00372314453125,
-0.0013427734375,
-0.06982421875,
-0.00714111328125,
0.0196533203125,
0.017333984375,
0.01336669921875,
-0.00188446044921875,
-0.01019287109375,
-0.00836181640625,
0.035400390625,
-0.0400390625,
0.01177978515625,
0.009033203125,
0.004425048828125,
0.0157470703125,
0.0015716552734375,
0.007354736328125,
-0.037841796875,
0.01214599609375,
-0.002685546875,
-0.0233154296875,
0.0107421875,
-0.052001953125,
-0.057373046875,
0.007354736328125,
0.007171630859375,
-0.02734375,
-0.005523681640625,
-0.0010528564453125,
-0.0106201171875,
0.0022125244140625,
0.08642578125,
-0.033447265625,
0.03125,
-0.0225830078125,
-0.03125,
0.007049560546875,
-0.0712890625,
-0.0023956298828125,
0.006317138671875,
-0.056640625,
0.0206298828125,
-0.017333984375,
0.048828125,
0.0419921875,
-0.0037078857421875,
0.02978515625,
0.0159912109375,
0.04443359375,
-0.060302734375,
-0.038330078125,
0.0277099609375,
0.036865234375,
0.0322265625,
-0.006011962890625,
-0.022705078125,
-0.050537109375,
0.041748046875,
0.01507568359375,
0.007232666015625,
0.050048828125,
0.041015625,
-0.0240478515625,
0.006561279296875,
0.000301361083984375,
-0.01446533203125,
0.01507568359375,
-0.0556640625,
-0.02734375,
-0.0081787109375,
0.04736328125,
0.0025787353515625,
-0.0283203125,
0.01129150390625,
-0.0252685546875,
0.0225830078125,
0.00421142578125,
-0.004791259765625,
0.005401611328125,
0.006011962890625,
-0.057861328125,
-0.0289306640625,
0.008544921875,
-0.0115966796875,
0.022705078125,
-0.008056640625,
-0.01416015625,
0.0021209716796875,
-0.0038909912109375,
0.01025390625,
-0.0247802734375,
0.01275634765625,
0.031494140625,
-0.00015926361083984375,
-0.004119873046875,
0.041259765625,
-0.040283203125,
0.052734375,
-0.052734375,
-0.0225830078125,
-0.06884765625,
-0.07421875,
-0.0341796875,
0.0196533203125,
0.0038604736328125,
-0.041015625,
-0.005615234375,
-0.039794921875,
-0.0205078125,
0.04931640625,
-0.0289306640625,
0.006561279296875,
0.020751953125,
-0.03369140625,
0.020263671875,
-0.039794921875,
0.0081787109375,
-0.0020294189453125,
0.0218505859375,
0.01611328125,
-0.0281982421875,
-0.027099609375,
0.00131988525390625,
-0.0654296875,
0.023681640625,
-0.006256103515625,
-0.0211181640625,
-0.006103515625,
-0.034912109375,
-0.00299072265625,
-0.0059814453125,
-0.036376953125,
-0.0174560546875,
-0.056640625,
0.000316619873046875,
-0.033935546875,
0.00726318359375,
-0.033447265625,
0.06591796875,
0.0018768310546875,
0.02734375,
-0.04345703125,
0.032958984375,
-0.01953125,
-0.0034332275390625,
0.01300048828125,
0.000713348388671875,
-0.008056640625,
-0.004730224609375,
-0.0341796875,
0.030517578125,
-0.0255126953125,
0.0286865234375,
-0.04638671875,
0.025146484375,
-0.03515625,
0.001220703125,
-0.01068115234375,
0.043212890625,
-0.01416015625,
-0.0223388671875,
-0.03369140625,
-0.0059814453125,
0.007049560546875,
0.0220947265625,
-0.0281982421875,
-0.00811767578125,
-0.00799560546875,
0.0263671875,
0.01373291015625,
0.00194549560546875,
0.20703125,
-0.0076904296875,
0.06591796875,
-0.0059814453125,
0.01043701171875,
-0.01251220703125,
0.038330078125,
-0.0087890625,
0.09765625,
0.003082275390625,
0.0096435546875,
0.000152587890625,
0.02392578125,
-0.04638671875,
-0.007781982421875,
-0.0030975341796875,
-0.00098419189453125,
-0.0181884765625,
-0.007232666015625,
-0.0235595703125,
0.033447265625,
0.0125732421875,
-0.00171661376953125,
0.022216796875,
-0.000278472900390625,
-0.02392578125,
0.0294189453125,
-0.0128173828125,
0.039794921875,
-0.0247802734375,
-0.049560546875,
0.003631591796875,
0.0206298828125,
0.0164794921875,
0.028564453125,
-0.01251220703125,
0.02294921875,
-0.00099945068359375,
-0.022216796875,
-0.050537109375,
0.001861572265625,
-0.0191650390625,
-0.045654296875,
-0.023681640625,
0.004119873046875,
0.050048828125,
-0.007232666015625,
0.05029296875,
-0.027099609375,
-0.0169677734375,
0.035888671875,
-0.02001953125,
-0.0040283203125,
0.01043701171875,
0.0035400390625,
-0.0059814453125,
0.0263671875,
-0.01220703125,
-0.016357421875,
0.01483154296875,
-0.0390625,
-0.00396728515625,
0.04345703125,
0.023193359375,
-0.0172119140625,
0.021240234375,
-0.04736328125,
0.006866455078125,
0.036376953125,
-0.0289306640625,
0.0208740234375,
0.011474609375,
-0.0260009765625,
0.01904296875,
0.016357421875,
-0.04931640625,
-0.0478515625,
0.0213623046875,
-0.01275634765625,
-0.032470703125,
-0.0084228515625,
-0.001922607421875,
0.0191650390625,
0.008544921875,
-0.037353515625,
0.0111083984375,
-0.022705078125,
-0.04052734375,
0.033447265625,
-0.01116943359375,
-0.03369140625,
-0.0167236328125,
-0.0303955078125,
0.0191650390625,
-0.06591796875,
0.00433349609375,
-0.078125,
-0.00732421875,
0.04150390625,
0.00286865234375,
0.0157470703125,
-0.00008440017700195312,
0.041015625,
-0.034423828125,
0.053466796875,
0.005218505859375,
0.01239013671875,
-0.006072998046875,
-0.00628662109375,
0.018798828125,
0.00151824951171875,
0.01165771484375,
0.0164794921875,
0.0089111328125,
0.0247802734375,
0.024658203125,
-0.0031585693359375,
-0.0181884765625,
-0.0103759765625,
-0.04541015625,
0.00323486328125,
0.0157470703125,
-0.004791259765625,
0.0308837890625,
-0.03076171875,
0.00506591796875,
0.02392578125,
-0.0179443359375,
0.0269775390625,
-0.003509521484375,
-0.0181884765625,
0.020263671875,
0.03076171875,
0.00145721435546875,
-0.004547119140625,
0.0167236328125,
0.035400390625,
-0.0206298828125,
-0.0810546875,
-0.0113525390625,
-0.02294921875,
0.0022735595703125,
0.0250244140625,
-0.006561279296875,
0.0223388671875,
0.043212890625,
0.03564453125,
-0.009765625,
0.036376953125,
-0.0028228759765625,
-0.003204345703125,
-0.041748046875,
-0.001495361328125,
-0.05419921875,
-0.04248046875,
-0.01708984375,
-0.002777099609375,
-0.01251220703125,
-0.001739501953125,
-0.0084228515625,
0.00130462646484375,
-0.000579833984375,
-0.0262451171875,
0.01446533203125,
0.037841796875,
0.02001953125,
-0.0157470703125,
-0.017333984375,
0.006317138671875,
0.0125732421875,
-0.0206298828125,
0.02197265625,
0.012939453125,
-0.0162353515625,
-0.068359375,
0.0250244140625,
0.006317138671875,
0.00201416015625,
-0.005126953125,
-0.00872802734375,
-0.06591796875,
0.0233154296875,
-0.01806640625,
-0.01312255859375,
-0.042236328125,
-0.03125,
-0.0198974609375,
-0.00193023681640625,
-0.004364013671875,
-0.0262451171875,
-0.0439453125,
-0.020263671875,
-0.01043701171875,
-0.0250244140625,
0.036865234375,
0.00946044921875,
-0.012939453125,
0.0213623046875,
-0.032470703125,
-0.048095703125,
0.020263671875,
0.00482177734375,
0.048583984375,
0.0301513671875,
-0.046630859375,
-0.021728515625,
-0.00408935546875,
0.0164794921875,
-0.006591796875,
-0.01409912109375,
0.0439453125,
-0.003936767578125,
-0.053955078125,
-0.0123291015625,
-0.025146484375,
-0.0185546875,
-0.061767578125,
0.0045166015625,
-0.01348876953125,
-0.025634765625,
0.0013580322265625,
-0.033447265625,
-0.0093994140625,
-0.039794921875,
0.04638671875,
-0.00799560546875,
-0.03955078125,
-0.015869140625,
0.0322265625,
0.0093994140625,
0.02978515625,
0.005828857421875,
-0.00011873245239257812,
-0.003936767578125,
0.0274658203125,
0.00701904296875,
0.000858306884765625,
-0.0146484375,
0.020263671875,
0.04150390625,
0.0216064453125,
-0.0164794921875,
0.043212890625,
0.0546875,
0.050048828125,
0.01470947265625,
-0.006561279296875,
0.00958251953125,
-0.0164794921875,
-0.01177978515625,
-0.03564453125,
-0.009765625,
0.007598876953125,
-0.00133514404296875,
-0.0322265625,
0.0211181640625,
0.029296875,
0.009765625,
-0.00151824951171875,
-0.0019073486328125,
-0.00830078125,
-0.032958984375,
0.028564453125,
-0.0186767578125,
-0.03369140625,
0.00897216796875,
-0.0040283203125,
-0.01104736328125,
0.01434326171875,
0.05419921875,
0.012939453125,
-0.0341796875,
-0.002471923828125,
-0.01446533203125,
0.04150390625,
0.04541015625,
0.0203857421875,
0.01251220703125,
-0.01348876953125,
0.01312255859375,
0.0052490234375,
0.0419921875,
-0.01092529296875,
-0.022705078125,
-0.0252685546875,
0.00836181640625,
-0.03759765625,
-0.03076171875,
0.009033203125,
-0.017822265625,
-0.00982666015625,
0.04736328125,
0.025634765625,
0.01165771484375,
-0.032470703125,
-0.0267333984375,
0.03515625,
0.0012664794921875,
-0.061767578125,
-0.0089111328125,
0.050048828125,
0.01275634765625,
0.0250244140625,
0.0191650390625,
0.036376953125,
0.005828857421875,
-0.0047607421875,
0.0023956298828125,
-0.031494140625,
-0.003143310546875,
-0.004669189453125,
0.0194091796875,
0.00897216796875,
-0.01104736328125,
0.0849609375,
0.007232666015625,
-0.0201416015625,
-0.0260009765625,
-0.0172119140625,
0.08984375,
0.0517578125,
-0.0260009765625,
-0.0260009765625,
-0.01239013671875,
-0.0247802734375,
0.0164794921875,
-0.0225830078125,
0.018798828125,
-0.038330078125,
-0.0308837890625,
-0.024658203125,
-0.002227783203125,
0.003204345703125,
0.0185546875,
-0.04443359375,
-0.004302978515625,
0.03271484375,
-0.01507568359375,
-0.00080108642578125,
0.038818359375,
0.0040283203125,
0.007049560546875,
-0.0198974609375,
0.004364013671875,
-0.02392578125,
-0.029296875,
-0.0037994384765625,
-0.033447265625,
0.01348876953125,
0.058837890625,
0.00811767578125,
0.0341796875,
0.0064697265625,
0.025146484375,
-0.000492095947265625,
0.0289306640625,
-0.0074462890625,
0.0189208984375,
0.031494140625,
-0.00518798828125,
-0.0283203125,
-0.0011444091796875,
0.03857421875,
0.036376953125,
0.026611328125,
0.005615234375,
-0.0032958984375,
-0.025146484375,
-0.02197265625,
0.0194091796875,
0.0181884765625,
0.07275390625,
-0.0045166015625,
0.025634765625,
-0.0286865234375,
0.0103759765625,
-0.01141357421875,
-0.03564453125,
0.049560546875,
-0.03173828125,
-0.0269775390625,
0.02734375,
0.041015625,
-0.01953125,
-0.0003528594970703125,
-0.0133056640625,
0.0203857421875,
-0.011474609375,
0.0150146484375,
0.0001010894775390625,
-0.01165771484375,
-0.026611328125,
0.03564453125,
-0.012939453125,
0.00024127960205078125,
-0.01446533203125,
0.0322265625,
-0.03125,
-0.0093994140625,
0.00066375732421875,
-0.02197265625,
0.0115966796875,
-0.033935546875,
-0.045166015625,
0.0361328125,
0.017822265625,
0.0322265625,
-0.0263671875,
0.04345703125,
-0.0133056640625,
-0.0478515625,
0.0087890625,
0.0208740234375,
-0.014892578125,
0.0135498046875,
0.044921875,
-0.006011962890625,
-0.0042724609375,
-0.0242919921875,
0.025146484375,
-0.021728515625,
-0.033935546875,
-0.0198974609375,
0.0172119140625,
0.00958251953125,
0.01806640625,
0.04345703125,
-0.01104736328125,
0.036376953125,
0.00141143798828125,
-0.03466796875,
-0.0001964569091796875,
-0.035400390625,
0.0191650390625,
0.0031280517578125,
0.00848388671875,
0.03076171875,
-0.0277099609375,
0.01953125,
0.08447265625,
0.0169677734375,
-0.02783203125,
-0.045166015625,
0.0181884765625,
0.039794921875,
-0.023681640625,
-0.00701904296875,
0.040771484375,
-0.020751953125,
0.06689453125,
-0.048828125,
-0.00164031982421875,
0.006256103515625,
0.01007080078125,
0.00323486328125,
-0.03662109375,
0.0419921875,
0.0002651214599609375,
-0.0301513671875,
-0.0111083984375,
0.072265625,
0.10009765625,
0.03564453125,
-0.0032501220703125,
0.06103515625,
0.0103759765625,
0.011474609375,
-0.034912109375,
-0.0115966796875,
0.025634765625,
-0.062255859375,
-0.01416015625,
0.037841796875,
-0.006683349609375,
-0.04736328125,
0.04345703125,
0.0021514892578125,
-0.0311279296875,
0.00885009765625,
-0.0107421875,
0.00372314453125,
-0.021728515625,
-0.00020503997802734375,
0.000263214111328125,
-0.01446533203125,
0.0028228759765625,
0.0277099609375
]
question_ocr = "画一画"
# res = es.search_question_textAndEmbedding( index_name, book_id, question_ocr, question_image_embedding, top_k=2)
# res = asyncio.run(res)
# res = json.dumps(res, ensure_ascii=False, indent=4)
# print(res)
import requests,re,os,sys
import base64
import json
import time
from volcenginesdkarkruntime import Ark # pip install -U volcengine-python-sdk[ark]
def get_file_content(filePath):
    """Return the full contents of the file at ``filePath`` as bytes."""
    with open(filePath, mode='rb') as handle:
        content = handle.read()
    return content
ocr_example = {"/home/liuxin/work/search_question/data/12677472-语文1-160.pdf.xlsx-题目-原图/1151411.jpg": """"
四、2022·浙江,7~9)阅读下面的文字,完成
1~3题。
中国食客说起中华美食之道,往往喜欢引用
孔子的“食不厌精,脍不厌细”八个字。其实,孔
子所言的“食不厌精,脍不厌细”,更侧重于祭祀
时饮食的态度而非对味道的追求。孔子生活的
春秋末期,烹饪、碓春、切肉工艺均相对原始,将
“食”做“精”“脍”做“细”,体现了厨人与食者严
肃认真的态度。孔子的饮食观背后,是其心怀的
礼制。《礼记》所言“夫礼之初,始诸饮食”,大意
即是“礼仪制度和风俗习惯始于饮食活动”
古代中国对食物的“淡漠”不仅出于食材的
积累、交融的缓慢,更在于儒家对口腹之欲的“打
压”
。一方面,孔子“君子谋道不谋食”的教诲让
士大夫阶层往往远离庖厨而以修齐治平为已任:
另一方面,自汉武帝刘彻“罢黜百家,独尊儒术
后,士大夫阶层仕途通畅,“学而优则仕”也有着
丰富的现实回报。至晚在唐代之前,文人对于饮
食之事是少有重视的。
隋唐时期饮食文化尤其是宴席之风虽有较
大发展,但在盛世文治武功的影响下,士大夫阶
层的追求依然在“提笔安天下、马上定乾坤”之
中,“烹羊宰牛”式的盛筵并没有孕育出与之相当
的饮食文化。唐代盛极一时的“烧尾宴”,也只是
公卿士大夫的盛宴,远非平民百姓所能享受。
转折来自于两宋:从个体角度来看,两宋文
化昌盛导致读书人与日俱增以至于仕途门槛抬
高,同时武功疲弱又令多少人壮志难酬;从朝廷
角度来看,宋室有鉴于唐朝藩镇割据之痛,自宋
太祖赵匡胤“杯酒释兵权”始便鼓励朝臣“择便好
田宅市之,为子孙立永远之业,多致歌儿舞女,日
饮酒相欢,以终其天年”。用舍行藏之下,也不由
得士大夫们不将视线转向饮食了。
元朝统一后,汉族士人愈加边缘化。明清易
代,朝廷中枢又多为满族垄断,“学而优则仕”的
路途不再畅通无阻,文人的兴趣自然而然愈加转
向声色大马。如以“小品圣手”名世的张岱,便在
《陶庵梦忆》中洋洋自得地夸口“越中清馋,无过
余者”,从北京的革婆果到台州的江瑶柱,从山西
的天花菜到临海的枕头瓜,大明两京一十三省的
美食竟被他尝了个遍。又如戏曲大家李渔,一边
醉心于梨园之乐,一边也不忘鲜衣美食这一类
“家居有事”,并在理论巨著《闲情偶寄》中加入
“饮馔”一部,系统阐述其“存原味、求真趣”的饮
食美学思想与“宗自然、尊鲜味”饮食文化观念
特殊的时代背景使得“饮食之人”不再被轻
贱,于是一大批美食家在清代前半叶应运而生:
在这一背景下,“食圣”袁枚登场了。
袁枚在《与薛寿鱼书》中公然提出“夫所谓不
朽者,非必周,孔而后不朽也
羿之射,秋之变
俞跗之医,皆可以不朽也”,而他自己则将饮食之
道视为堪与周公孔子之为相媲美的事业,因此可
以毫无顾忌地“每食于某氏而饱,必使家厨往彼
灶觚,执弟子之礼”。
袁枚作诗以“性灵说”为主张,认为诗应直抒
心灵,表达真意,这一主张也融合到了饮食中:他
认为在烹饪之前要了解食材、尊重物性,注意食
材间的搭配和时间把握;他反对铺张浪费,提出
看佳原不在钱多”,食材之美更在于物尽其用;
他将人文主义引入饮食,宣扬“物为人用,使之死
可也,使之求死不得不可也”。他强调烹饪理论
的重要性,以为中国烹法完全依厨人经验不利于
传承,为了给后世食客厨人树立典范,又煞费苦
心撰写出了《随园食单》--这部南北美食集大
成之作,再一次为中华美食的发展开启了新的
纪元。
《随园食单》之前,中国历代亦不乏饮食著
作,但关于制法的记述往往过于简略,如隋代《食
经》唐代《烧尾宴食单》之类甚至流于“报菜名
宋元以降,饮食著作的烹饪方法逐渐明晰,但亦
停留在“形而下”的层次。而《随园食单》则完成
了饮食文化从经验向理论的最终蜕变,如“须知
单”“戒单”中梳理了物性、作料、洗刷、调剂、搭
配、火候、器具、上菜等方方面面,“上菜须知”中
的“盐者宜先,淡者宜后;浓者宜先,薄者宜后
等,都是对中国千年烹饪经验一次开创性的总结
与编排
在袁枚和他的《随园食单》之后,中国饮食文
化从“形而上”的思想层面迈上了一个新台阶,在
之后的百余年里,帮口菜渐渐发达,“四大菜系’
大菜系”逐渐成形
摘编自江隐龙《中华尚食之道里
自有一个民族坚韧的初心》)
1.下列对文中“中华饮食文化”的相关理解,不正
确的一项是
A.中华饮食文化跟礼仪关系紧密,“夫礼之
初,始诸饮食”说明饮食活动从一开始就被
赋予礼仪要求
B.中华饮食文化发展的影响因素有很多,与国
家的强弱并不一致,而与历代文人士大夫
的态度有较大关联。
C.中华饮食文化发展中,唐代以前的文人很少
重视饮食,跟“君子谋道不谋食”的教诲和
“学而优则仕”的现实回报有关。
D.中华饮食文化在明清时代出现了“存原味、
求真趣”的饮食美学思想与“宗自然、尊鲜
味”的饮食文化观念,
2.下列说法符合原文意思的一项是
A.中国食客喜欢用“食不厌精,脍不厌细”标
榜中华美食之道,这八个字从一个侧面反
映了在孔子时代把饮食做到“精细”并非
B.两宋时期饮食风气发生了变化,产生了转
折,无论从个体角度还是从朝廷角度来看,
这都是经济比较发达造成的。
C.袁枚将自己的饮食之道当作与周公孔子的
饮食之道相媲美的不朽事业,饮宴饱食归
来,都派自己的厨子去对方家学习
D.袁枚把人文主义融入饮食,大致表现在这
样三方面:尊重物性,要了解食材;不要浪
费,要物尽其用;物为人用,要保护生命。
3.概括中华饮食文化得到发展的原因,
""",
"/home/liuxin/work/search_question/data/12677472-语文1-160.pdf.xlsx-题目-原图/1151417.jpg":"""(2024·全国甲,4~6)阅读下面的文字,完成
1~3题
偷梁换柱”多指以假代真,用欺骗的手段改
变事物的性质,然而在古建筑工程领域,偷梁换
柱”却属于一种科学实用的修缮加固方法。
梁是截面形状一般为长方形的木料,且木料
的长度尺寸远大于截面尺寸。梁为水平放置,两
端的底部有支撑构件。梁主要用于承担建筑上
部构件及屋顶的全部重量,并把这些重量向下传
给支撑构件。柱为梁的支撑构件。柱子截面形
状一般为圆形,长度尺寸远大于截面直径。柱子
为竖向放置,主要用于承担上部梁传来的重量,
并向下传递给下部的梁或直接传至地面。梁与
柱采用榫卯形式连接,形成稳固的大木结构体
系。位于屋架内的若干梁在竖向被层层往上
“抬”,上下梁之间由短柱支撑,底部的梁由立于
地面的立柱支撑。梁、柱均为中国木结构古建筑
的核心受力、传力构件,缺一不可。
对于古建筑而言,立于地面的立柱,或因长
期承受上部结构传来的重量而产生开裂残损,或
因柱底部位长期受到地面潮气影响而出现糟朽
残损,这导致木柱强度下降,无法正常支撑梁,
此时可采用“偷梁换柱”的加固方法。“偷梁换
柱”实际就是“托梁换柱”。其基本做法为:首先
将“假柱”即临时的竖向支撑构件)安装在梁底
部、原柱(原有立柱)旁边;再抽去原柱,使梁传来
的重量暂时由“假柱”承担;然后安装新柱,新柱
的材料、尺寸及安装位置与原有立柱相同;最后
将“假柱”移去。
完善的“偷梁换柱”加固方法具有科学性,其
原理主要包括三个方面:其一,从梁的角度而言,
它是水平受力构件,并把外力向下传给立柱。梁
只有保持水平稳定状态,才能保证整个大木结构
的稳定。在加固古建筑的过程中,梁始终受到支
托,因而能一直保持水平稳定状态。其二,从柱
的角度而言,它是竖向支撑构件,并最终把上部
构件的重量传给地基。只有立柱具有充足的承载力,且与梁有可靠连接时,才能有效承担梁传
来的作用力。加固过程中,技术人员虽然将原柱
抽去,但是预先将“假柱”设置于原柱附近,让“假
柱”代替原柱发挥支撑作用,因而换柱过程对结
构整体的稳定基本无影响。换柱完成后,新柱与
原柱有着同样的材料、尺寸,且与梁有着相同的
可靠连接方式,它完全能够代替原柱发挥支撑作
用。其三,从梁、柱整体结构角度而言,“偷梁换
柱”方法对整体结构干扰小,且能达到良好的加
固效果:原柱被新柱原位替换,新柱不仅有很好
的支撑作用,而且与梁仍有可靠连接;“假柱”仅
用于加固过程的临时支撑,且在原柱撤去后新柱
安装前,能够与梁临时组成稳定的结构体系。因
此,在“偷梁换柱”过程中,梁、柱结构整体始终处
于稳定状态 1.下列对原文相关内容的理解和分析,不正确的
一项是
A.“偷梁换柱”这一成语在现今的使用中多含
有贬义的色彩,但在古建筑工程领域,它是
指一种修缮加固的科学方法,完全没有
贬义。
B.中国古建筑大木构架剖面示意图展示了几
种不同位置、不同尺寸的柱,这些柱子中:
立于地面的立柱比较容易发生糟朽残损的
情况。
C.结合图文可以发现,屋顶的重量由上层柱承
担,然后传给梁,再由梁传递给其下的短
柱,依次向下传递,最终由底部的立柱传至
地面。
D.“偷梁换柱”的加固方法包括托梁、抽柱、换
柱等步骤,在每一个步骤中梁一直会得到
很好的支撑,从而始终能够保持水平稳定
状态。
★2.请根据原文内容,在下面文段的横线处补写
出恰当的词语。
工程实例:故宫太和殿是我国最大的木构
大殿,明清两代帝王即位或节日庆典都在此举
行。2004年,技术人员在对太和殿进行勘查
时,发现有一根立柱下部三分之一的位置出现
了严重糟朽,于是采取了“偷梁换柱”的方法对
该立柱进行加固。具体过程如下:先使用“假
柱”托住原柱上部的梁。“假柱”为完好的木
料,被安装在
附近,用于临时支撑
梁。再把柱子底部糟朽部分抽去,以便用
代替。原柱糟朽部分去掉后,剩余的部
分做成巴掌形,与新柱搭接。新柱与被抽去的
糟朽部分同材料、同形状、同尺寸,且顶部亦做
成巴掌榫形状。最后再把
拆除,即
完成了原有立柱的加固,
3.清代的古籍中有另一种“偷梁换柱”的记载:当
某根立柱损坏需要更换时,为节省工料,工匠
只是在原柱旁边设一根新柱,再撤去原柱。为
什么第2题“工程实例”中,太和殿修缮没有采用这种更简便的加固方式呢?请简要分析"""}
# 合合ocr
def recognize_pdf2md(image_path, options=None, is_url=False, timeout=(10, 300)):
    """
    Run TextIn ("合合") pdf-to-markdown OCR and return the response as a JSON string.

    API reference: https://www.textin.com/document/pdf_to_markdown

    The request is attempted up to 5 times. If an attempt fails and
    ``image_path`` is a key of the module-level ``ocr_example`` dict, the
    canned text stored there is returned instead of retrying further.

    :param image_path: path of a local file, or a URL when ``is_url`` is true
    :param options: query parameters sent to the API; when falsy, the
        scan-mode defaults defined in the body are used. A fuller example:
            options = {
                'pdf_pwd': None,
                'dpi': 144,               # render at 144 dpi
                'page_start': 0,
                'page_count': 1000,       # parse up to 1000 pages
                'apply_document_tree': 0,
                'markdown_details': 1,
                'page_details': 0,        # do not include page detail info
                'table_flavor': 'md',
                'get_image': 'none',
                'parse_mode': 'scan',     # scan parse mode
            }
    :param is_url: when true, ``image_path`` is sent as a plain-text URL;
        otherwise the file is read and uploaded as a binary stream
    :param timeout: ``timeout`` argument forwarded to ``requests.post`` so a
        stalled connection raises (and is retried) instead of blocking forever
    :return: JSON string; callers read ``["result"]["markdown"]`` from it.
        If every attempt raised before a response was parsed, this is
        ``{"result": {"markdown": ""}}``. If the service answered with JSON
        that lacks ``result.markdown``, the last such payload is returned as-is.

    Image width and height must be between 20 and 10000 pixels.
    """
    # NOTE(review): credentials are hard-coded in source; consider moving them
    # to configuration / environment variables.
    _app_id = "2bae9d892b488e1d29f4ca0b650ad7a5"
    _app_secret = "46d25182d74cb02b9fb8a06734fb73a4"
    host = 'https://api.textin.com'
    url = host + '/ai/service/v1/pdf_to_markdown'
    if not options:
        options = {
            'table_flavor': 'none',  # no table recognition; table images are read as plain text paragraphs
            'parse_mode': 'scan',  # scan parse mode
            'page_details': 0,  # 0 = omit page details (as in the docstring example)
            'markdown_details': 0,
            'apply_document_tree': 1,
            'raw_ocr': 1,
        }
    headers = {
        'x-ti-app-id': _app_id,
        'x-ti-secret-code': _app_secret
    }
    if is_url:
        image = image_path
        headers['Content-Type'] = 'text/plain'
    else:
        image = get_file_content(image_path)
        headers['Content-Type'] = 'application/octet-stream'
    # Default payload returned when no attempt ever yields parsable JSON.
    res = {"result": {"markdown": ""}}
    for _attempt in range(5):
        try:
            resp = requests.post(url, data=image, headers=headers,
                                 params=options, timeout=timeout)
            res = json.loads(resp.text)
            # Validate the payload shape: a missing key raises and triggers a retry.
            res['result']['markdown']
            break
        except Exception as e:
            if image_path in ocr_example:
                # Known sample image: fall back to the stored OCR text.
                res = {"result": {"markdown": ocr_example[image_path]}}
                break
            print(f"合合ocr error:{image_path}; {e}")
    res = json.dumps(res, ensure_ascii=False)
    return res
def image_to_base64(image_path: str) -> str:
    """Encode an image file as a ``data:image/<format>;base64,<payload>`` URI.

    The format is taken from the lower-cased file extension. A ``.jpg``
    extension is reported as ``jpeg`` because ``image/jpg`` is not a
    registered media type.

    :param image_path: path of the image file to read
    :return: the data URI string
    """
    # Read the raw image bytes.
    with open(image_path, "rb") as f:
        image_bytes = f.read()
    # Image format from the file extension (e.g. "jpg", "png"), without the dot.
    image_format = os.path.splitext(image_path)[1].lower().strip(".")
    if image_format == "jpg":
        image_format = "jpeg"
    # Assemble the data URI in the layout the Doubao API expects.
    encoded = base64.b64encode(image_bytes).decode('utf-8')
    return f"data:image/{image_format};base64,{encoded}"
def doubao_image_embedding(image_file, text):
    """Embed an image together with its text using the Doubao multimodal model.

    Docs: https://www.volcengine.com/docs/82379/1523520

    The request is attempted up to 5 times.

    :param image_file: path of the image; it is sent as a base64 data URI
    :param text: accompanying text (e.g. the OCR result); only the first
        4096 characters are sent
    :return: the ``embedding`` entry of the response data (1024 dimensions
        are requested)
    :raises RuntimeError: if all 5 attempts fail; the last error is chained
    """
    # NOTE(review): API key is hard-coded in source; consider moving it to configuration.
    doubao_client = Ark(api_key="35f1674f-22b4-434b-9a8d-0d80e8d1ef6b")  # Doubao image-text embedding client
    base64_data = image_to_base64(image_file)
    text = text[:4096]
    last_error = None
    for _attempt in range(5):
        try:
            resp = doubao_client.multimodal_embeddings.create(
                model="doubao-embedding-vision-250615",  # original note: embed_dim=2048
                encoding_format="float",
                dimensions=1024,
                input=[{"text": text, "type": "text"}, {"image_url": {"url": base64_data}, "type": "image_url"}]
            )
            return resp.data['embedding']
        except Exception as e:
            last_error = e
            print(f"doubao embedding error: {image_file}; {e}")
    # Previously this path hit an unbound local variable; fail with a clear error instead.
    raise RuntimeError(
        f"doubao image embedding failed after 5 attempts: {image_file}"
    ) from last_error
def doubao_text_embedding(text):
    """Embed a piece of text using the Doubao multimodal embedding model.

    The request is attempted up to 5 times and stops at the first success.

    :param text: text to embed; only the first 4096 characters are sent
    :return: the ``embedding`` entry of the response data (1024 dimensions
        are requested)
    :raises RuntimeError: if all 5 attempts fail; the last error is chained
    """
    # NOTE(review): API key is hard-coded in source; consider moving it to configuration.
    doubao_client = Ark(api_key="35f1674f-22b4-434b-9a8d-0d80e8d1ef6b")  # Doubao image-text embedding client
    text = text[:4096]
    last_error = None
    for _attempt in range(5):
        try:
            resp = doubao_client.multimodal_embeddings.create(
                model="doubao-embedding-vision-250615",  # original note: embed_dim=2048
                encoding_format="float",
                dimensions=1024,
                input=[{"text": text, "type": "text"}]
            )
            # Stop at the first success; the original loop had no exit on success.
            return resp.data['embedding']
        except Exception as e:
            last_error = e
            print(f"doubao text embedding error:{e}; text: {text}")
    # Previously this path hit an unbound local variable; fail with a clear error instead.
    raise RuntimeError("doubao text embedding failed after 5 attempts") from last_error
def monkey_ocr(file_path):
    """Upload a file to the MonkeyOCR text endpoint and return the decoded JSON reply.

    :param file_path: path of the local file to upload
    :return: the response body parsed with ``response.json()``
    """
    url = 'http://61.170.32.8:38878/ocr/text'
    # Open the upload inside a context manager so the handle is always closed
    # (the original left the file open).
    with open(file_path, "rb") as upload:
        files = {"file": upload}
        response = requests.post(
            url=url,
            files=files,
        )
    result = response.json()
    return result
if __name__ == "__main__":
    # Manual smoke test: OCR one sample image with TextIn, then with the MonkeyOCR service.
    started_at = time.time()
    # Sample images tried during development; only the last entry is used.
    sample_images = [
        "/home/liuxin/work/search_question/data/人工标注的手机拍题图片/3-1151454.jpg",
        "/home/liuxin/work/search_question/data/12664728-课计划·七年级英语·RJ·上册/93.jpg",
        # "/home/liuxin/work/search_question/data/12677472-语文1-160.pdf.xlsx-题目-原图/1151411.jpg",  # image larger than 10000 px, over TextIn's limit
        "/home/liuxin/work/search_question/data/人工标注的手机拍题图片/3-1151454.jpg",
        "/home/liuxin/work/search_question/data/人工标注的手机拍题图片/4-1151471.jpg",
        "/home/liuxin/work/search_question/data/人工标注的手机拍题图片/7-1151505.jpg",
        "/home/liuxin/work/search_question/data/12664728-课计划·七年级英语·RJ·上册/93.jpg",
    ]
    image = sample_images[-1]

    #### 1. TextIn OCR: upload the file itself
    ocr_options = {
        'table_flavor': 'none',  # no table recognition; table images are read as plain text paragraphs
        'parse_mode': 'scan',  # scan parse mode
        'page_details': 0,
        'markdown_details': 0,
        'apply_document_tree': 1,
        'raw_ocr': 1,
    }
    resp = recognize_pdf2md(image, ocr_options)
    print(f"耗时:{time.time()-started_at}")
    text = json.loads(resp)['result']['markdown']
    print("合合ocr")
    print(text)

    # Disabled embedding checks kept for reference:
    # image = "/home/liuxin/work/search_question/data/12677472-语文1-160.pdf.xlsx-题目-原图/1151411.jpg"
    # res = doubao_image_embedding(image, text)
    # print(res[:5])
    #
    # res = doubao_text_embedding(text)  # original note: this API call does not go through
    # print(res[:5])

    monkey_result = monkey_ocr(image)
    print("monkey ocr:")
    print(monkey_result)
    print("\n.finished.")
import copy
from datetime import datetime
import json,re, os
import pandas as pd
from PIL import Image
import requests
import base64
from tritonclient import grpc as triton_grpc
from paddle_ocr import infer_paddle_ocr
from tools import recognize_pdf2md # 合合ocr
from volcenginesdkarkruntime import Ark # pip install -U volcengine-python-sdk[ark]
from api_tst import get_page_image_url
# Directory containing this script; used below as the base of the local "data" output folders.
__path__ = os.path.dirname(os.path.abspath(__file__))
class AddPaddleOCR():
# 给每张整图添加paddleOCR的结果,给每个题目添加paddleOCR结果
def __init__(self):
    """Create the PaddleOCR (Triton gRPC) client and the Doubao embedding client."""
    # Triton-served PaddleOCR endpoint (original note: deployed by Li Yaze).
    self.client_ocr = triton_grpc.InferenceServerClient('61.170.32.8:38896')
    # Doubao image-text embedding client.
    # NOTE(review): the API key is hard-coded in source.
    self.doubao_client = Ark(api_key="35f1674f-22b4-434b-9a8d-0d80e8d1ef6b")
def read_excel(self, excel_file, sheet_name="题库数据"):
    """Load one sheet of an Excel file as a list of row dicts.

    :param excel_file: path (or file-like object) passed to ``pd.read_excel``
    :param sheet_name: sheet to read
    :return: list of ``{column: value}`` dicts, one per row, with missing
        cells replaced by the empty string
    """
    data = pd.read_excel(excel_file, sheet_name)
    # fillna returns a new DataFrame; the original discarded it, so NaN survived.
    data = data.fillna("")
    return data.to_dict(orient='records')
def save_excel(self, data_list, excel_file, sheet_name="题库数据" ):
    """Write ``data_list`` to ``excel_file`` as one sheet, without the index column."""
    pd.DataFrame(data_list).to_excel(excel_file, index=False, sheet_name=sheet_name)
    print("保存数据成功:", excel_file)
def download_image(self, image_url, save_path):
    """Download ``image_url`` and write it to ``save_path``.

    :param image_url: URL of the image to fetch
    :param save_path: destination file path; missing parent directories are created
    :return: ``True`` once the file has been written
    :raises requests.HTTPError: if the server answers with an error status
    """
    # Stream the body; the context manager releases the connection even on error.
    with requests.get(image_url, stream=True, timeout=10) as response:
        # Abort on a non-success HTTP status.
        response.raise_for_status()
        # Create the parent directory. Skip when save_path has no directory part,
        # because os.makedirs("") raises; exist_ok avoids the check-then-create race.
        directory = os.path.dirname(save_path)
        if directory:
            os.makedirs(directory, exist_ok=True)
        # Write the body to disk chunk by chunk.
        with open(save_path, 'wb') as file:
            for chunk in response.iter_content(chunk_size=8192):
                if chunk:
                    file.write(chunk)
    return True
def image_to_base64(self, image_path: str) -> str:
    """Encode an image file as a ``data:image/<format>;base64,<payload>`` URI.

    The format is taken from the lower-cased file extension. A ``.jpg``
    extension is reported as ``jpeg`` because ``image/jpg`` is not a
    registered media type.

    :param image_path: path of the image file to read
    :return: the data URI string
    """
    # Read the raw image bytes.
    with open(image_path, "rb") as f:
        image_bytes = f.read()
    # Image format from the file extension (e.g. "jpg", "png"), without the dot.
    image_format = os.path.splitext(image_path)[1].lower().strip(".")
    if image_format == "jpg":
        image_format = "jpeg"
    # Assemble the data URI in the layout the Doubao API expects.
    encoded = base64.b64encode(image_bytes).decode('utf-8')
    return f"data:image/{image_format};base64,{encoded}"
def doubao_image_embedding(self, image_file, text):
    """Embed an image plus its text with the Doubao multimodal model.

    Docs: https://www.volcengine.com/docs/82379/1523520

    :param image_file: path of the image; sent as a base64 data URI
    :param text: accompanying text; only the first 4096 characters are sent
    :return: the ``embedding`` entry of the response data (1024 dimensions requested)
    """
    image_data_uri = self.image_to_base64(image_file)
    clipped_text = text[:4096]
    payload = [
        {"text": clipped_text, "type": "text"},
        {"image_url": {"url": image_data_uri}, "type": "image_url"},
    ]
    response = self.doubao_client.multimodal_embeddings.create(
        model="doubao-embedding-vision-250615",  # original note: embed_dim=2048
        encoding_format="float",
        dimensions=1024,
        input=payload,
    )
    return response.data['embedding']
def doubao_text_embedding(self, text):
    """Embed text with the Doubao multimodal model.

    :param text: text to embed; only the first 4096 characters are sent
    :return: the ``embedding`` entry of the response data (1024 dimensions requested)
    """
    payload = [{"text": text[:4096], "type": "text"}]
    response = self.doubao_client.multimodal_embeddings.create(
        model="doubao-embedding-vision-250615",  # original note: embed_dim=2048
        encoding_format="float",
        dimensions=1024,
        input=payload,
    )
    return response.data['embedding']
def paddle_ocr(self, file):
    """Run PaddleOCR on an image and return the boxes and texts as a JSON string.

    :param file: image path (or file object) accepted by ``Image.open``
    :return: JSON string with keys ``db_boxes`` and ``rec_texts``
    """
    # Example input: "/home/liuxin/work/search_book/data/12664728-课计划·七年级英语·RJ·上册/1.jpg"
    rgb_image = Image.open(file).convert('RGB')
    boxes, texts = infer_paddle_ocr(rgb_image, self.client_ocr)
    return json.dumps({"db_boxes": boxes, "rec_texts": texts}, ensure_ascii=False)
def get_page_image_url(self, bankId):
    """Look up the URL of the full-page image that contains a question.

    Args:
        bankId: Question id, sent as the ``bankId`` query parameter.

    Returns:
        The ``data.sourceImageUrl`` field of the JSON response.

    Raises:
        requests.RequestException: On connection errors, a timeout (10 s) or
            a non-2xx status code.
        KeyError: If the response lacks ``data`` or ``sourceImageUrl``.
    """
    url = "https://rays7.5rs.me/matrix/v1.0/aIRecognized/getPageInfoByBankId"
    # NOTE(review): the backend token is hard-coded; consider moving it to
    # configuration.
    headers = {
        "token": "whlg2025!",
        "Only-For-Backend": "y"
    }
    params = {"bankId": bankId}
    # requests never times out unless asked to; without a timeout one stalled
    # call would block the whole import run.
    response = requests.get(url, headers=headers, params=params, timeout=10)
    # Surface HTTP failures directly instead of as a confusing parse error.
    response.raise_for_status()
    return json.loads(response.text)['data']['sourceImageUrl']
def add_imageUrl_and_paddleOCR(self, excel_path, book_id, book_name):
    """Add question OCR, page image URL and page OCR to a downloaded workbook.

    Reads sheet ``题库数据`` of the workbook downloaded from the book-splitting
    platform, downloads every question image and every full-page image under
    ``<__path__>/data`` and runs OCR on them. Despite the "paddleOCR" naming,
    the OCR currently used is ``recognize_pdf2md`` (Hehe OCR).

    Args:
        excel_path: Path of the downloaded workbook.
        book_id: Book id, used to name the image folders.
        book_name: Book name, used to name the image folders.

    Returns:
        One dict per workbook row, extended with ``question_ocr``,
        ``source_image_url`` and ``page_ocr``.
    """
    sheet_name = "题库数据"
    data2 = pd.read_excel(excel_path, sheet_name)
    # fillna returns a new frame; without rebinding, NaN cells survive.
    data2 = data2.fillna("")
    question_ids = data2['题目id']
    page_ids = data2['电子页码']
    data2 = data2.to_dict(orient='records')

    # 1. OCR every question image.
    question_save_path = os.path.join(__path__, "data", str(book_id) + "-" + book_name + "-题目-原图")
    os.makedirs(question_save_path, exist_ok=True)
    for line in data2:
        question_image_url = line['题目图片地址']
        question_id = str(line['题目id'])
        question_save_file = os.path.join(question_save_path, question_id + ".jpg")
        print("question_save_file: ", question_save_file)
        self.download_image(question_image_url, question_save_file)
        try:
            # Hehe OCR; image width/height must be between 20 and 10000 px.
            question_paddle_ocr = json.loads(recognize_pdf2md(question_save_file))['result']['markdown']
        except Exception:
            # Best effort: fall back to the question text from the workbook.
            question_paddle_ocr = line['题目文本']
        line['question_ocr'] = question_paddle_ocr
    print("获取每个题目的OCR结束。\n")

    # 2. Resolve the full-page image URL once per page, via the first
    #    question seen on that page.
    page_ids2source_image_url = {}
    for page_id, question_id in zip(page_ids, question_ids):
        if page_id not in page_ids2source_image_url:
            source_image_url = self.get_page_image_url(question_id)
            page_ids2source_image_url[page_id] = {"source_image_url": source_image_url}
    print("获取 所有图片(整图)URL结束\n")

    # 3. Download and OCR every full-page image.
    save_path = os.path.join(__path__, "data", str(book_id) + "-" + book_name + "-原图")
    os.makedirs(save_path, exist_ok=True)
    for page_id, page_info in page_ids2source_image_url.items():
        save_file = os.path.join(save_path, f"{page_id}.jpg")
        print(save_file)
        self.download_image(page_info['source_image_url'], save_file)
        page_info['page_ocr'] = json.loads(recognize_pdf2md(save_file))['result']['markdown']

    # 4. Copy the per-page values onto every row of that page.
    for line in data2:
        page_info = page_ids2source_image_url[line['电子页码']]
        line['source_image_url'] = page_info['source_image_url']
        line['page_ocr'] = page_info['page_ocr']
    return data2
def add_page_image_embedding(self, data_list, image_path):
    """Attach the full-page image embedding to every row.

    The embedding is computed once per distinct ``电子页码`` from
    ``<image_path>/<page_id>.jpg`` plus that row's ``page_ocr`` text, and
    reused for the other rows of the same page.

    Args:
        data_list: Row dicts containing ``电子页码`` and ``page_ocr``.
        image_path: Folder holding the full-page images.

    Returns:
        ``data_list`` itself, each row extended in place with
        ``page_image_embedding`` (a JSON string).
    """
    embedding_by_page = {}
    for row in data_list:
        page_id = row['电子页码']
        if page_id not in embedding_by_page:
            page_image = os.path.join(image_path, f"{page_id}.jpg")
            embedding_by_page[page_id] = self.doubao_image_embedding(page_image, row['page_ocr'])
        row['page_image_embedding'] = json.dumps(embedding_by_page[page_id])
    return data_list
def add_question_image_embedding(self, data_list, image_path):
    """Attach the question image embedding to every row.

    Each row is embedded from ``<image_path>/<题目id>.jpg`` together with
    its ``question_ocr`` text.

    Args:
        data_list: Row dicts containing ``题目id`` and ``question_ocr``.
        image_path: Folder holding the question images.

    Returns:
        ``data_list`` itself, each row extended in place with
        ``question_image_embedding`` (a JSON string).
    """
    for row in data_list:
        question_image = os.path.join(image_path, str(row['题目id']) + ".jpg")
        embedding = self.doubao_image_embedding(question_image, row['question_ocr'])
        row['question_image_embedding'] = json.dumps(embedding)
    return data_list
def get_excel_name(self, path, save_down_excel_path):
    """List the downloaded workbooks that have not been processed yet.

    A download is skipped when ``save_down_excel_path`` contains a file whose
    name, minus its last five characters, equals the download's file name.

    Args:
        path: Folder with workbooks downloaded from the book-splitting
            platform, named ``<book_id>-<book_name>``.
        save_down_excel_path: Folder with already processed workbooks.

    Returns:
        A list of ``[file_path, book_id, book_name]`` entries, where
        ``book_id`` is the text before the first ``-`` of the file name and
        ``book_name`` is everything after it.
    """
    # 1. Books that are already pre-processed (OCR and embeddings).
    processed = [name[:-5] for name in os.listdir(save_down_excel_path) if name.endswith('xlsx')]
    # 2. Downloads that still need pre-processing.
    data = []
    for name in os.listdir(path):
        if not name.endswith('xlsx') or name in processed:
            continue
        book_id, _, book_name = name.partition("-")
        data.append([os.path.join(path, name), book_id, book_name])
    print(f"拆书平台下载了{len(data)}个excel文件。")
    return data
def result(self):
    """Pre-process every downloaded workbook that has not been handled yet.

    For each pending workbook this adds the page image URL, page OCR and
    question OCR columns, then the page and question embeddings, and saves
    the result twice: next to the full-page images and in the
    "processed" folder. The saved workbooks are what later gets uploaded to
    ES through the API. A failure in one book is printed and the run
    continues with the next book.
    """
    down_excel_path = "/home/liuxin/work/search_question/data/拆书平台下载excel文件"
    save_down_excel_path = "/home/liuxin/work/search_question/data/拆书平台下载excel文件_添加字段内容"
    # Each entry is [excel_path, book_id, book_name].
    down_excel_data = self.get_excel_name(down_excel_path, save_down_excel_path)
    for item in down_excel_data:
        excel_path, book_id, book_name = item
        print(f"excel_path: {excel_path}, book_id: {book_id}, book_name: {book_name}")
        try:
            # Folder of the reference question images.
            question_image_save_path = os.path.join(__path__, "data", str(book_id) + "-" + book_name + "-题目-原图")
            # Folder of the reference full-page images.
            page_image_save_path = os.path.join(__path__, "data", str(book_id) + "-" + book_name + "-原图")
            # Use this instance rather than the module-level one so the
            # method works on any AddPaddleOCR object.
            data_list = self.add_imageUrl_and_paddleOCR(excel_path, book_id, book_name)
            data_list = self.add_page_image_embedding(data_list, page_image_save_path)
            print("添加 page embedding 完成。")
            data_list = self.add_question_image_embedding(data_list, question_image_save_path)
            print("添加 question embedding 完成。")
            save_excel_file = os.path.join(page_image_save_path, str(book_id) + "-" + book_name + ".xlsx")
            self.save_excel(data_list, save_excel_file)
            save_excel_file = os.path.join(save_down_excel_path, str(book_id) + "-" + book_name + ".xlsx")
            self.save_excel(data_list, save_excel_file)
        except Exception as e:
            # Best effort per book: report and move on.
            print(f"error {e}")
            print(item)
# Module-level pipeline instance. It enriches workbooks downloaded from the
# book-splitting platform with the page image URL, page OCR and question OCR
# columns before they are inserted into ES through the API.
# AddPaddleOCR.result() refers to this global by name.
AddPaddleocr = AddPaddleOCR()
# AddPaddleocr.result()
# image_path = ""
# image_embedding = AddPaddleocr.doubao_image_embedding( image_path)
# Evaluates whole-page image search and question search.
class Test():
    def __init__(self):
        """Create the OCR and embedding clients used by the evaluation runs.

        Sets ``self.client_ocr`` (Triton gRPC client for PaddleOCR) and
        ``self.doubao_client`` (Ark client for the Doubao embedding model).
        """
        paddle_ocr_url = '61.170.32.8:38896'
        self.client_ocr = triton_grpc.InferenceServerClient(paddle_ocr_url)
        # NOTE(security): an API key is committed in source. Prefer the
        # ARK_API_KEY environment variable; the literal is kept only as a
        # fallback so existing runs keep working, and should be rotated.
        api_key = os.environ.get("ARK_API_KEY", "35f1674f-22b4-434b-9a8d-0d80e8d1ef6b")
        self.doubao_client = Ark(api_key=api_key)  # Doubao image+text embedding model
def image_to_base64(self, image_path: str) -> str:
    """Return the image at ``image_path`` as a base64 ``data:`` URI.

    The URI has the form ``data:image/<ext>;base64,<payload>``, where
    ``<ext>`` is the lower-cased file extension without dots; this is the
    format passed to the Doubao embedding API.
    """
    with open(image_path, "rb") as source:
        payload = base64.b64encode(source.read())
    suffix = os.path.splitext(image_path)[1]
    return f"data:image/{suffix.lower().strip('.')};base64,{payload.decode('utf-8')}"
def download_image(self, image_url, save_path):
    """Download an image to ``save_path``, trying up to five times.

    Args:
        image_url: HTTP(S) address of the image.
        save_path: Destination file; missing parent directories are created.

    Returns:
        True when the file was written, False when every attempt failed.
        Errors are reported on stdout and never raised.
    """
    max_attempts = 5
    for attempt in range(1, max_attempts + 1):
        try:
            # The context manager releases the streamed connection even if
            # the download breaks half-way.
            with requests.get(image_url, stream=True, timeout=100) as response:
                response.raise_for_status()
                # ``dirname`` is empty for a bare file name; makedirs("")
                # would raise, so only create a directory when there is one.
                directory = os.path.dirname(save_path)
                if directory:
                    os.makedirs(directory, exist_ok=True)
                with open(save_path, 'wb') as file:
                    for chunk in response.iter_content(chunk_size=8192):
                        if chunk:
                            file.write(chunk)
            return True
        except Exception as e:
            # Deliberately best effort, but say what went wrong instead of
            # swallowing it silently.
            print(f"download_image attempt {attempt}/{max_attempts} failed for {image_url}: {e}")
    return False
def doubao_image_embedding(self, image_file, text):
    """Embed an image together with its text, retrying the API call.

    Official docs: https://www.volcengine.com/docs/82379/1523520

    Args:
        image_file: Path of the image; it is sent as a base64 ``data:`` URI.
        text: Text paired with the image; only the first 4096 characters
            are sent.

    Returns:
        The value stored under ``'embedding'`` in the response data
        (requested with ``dimensions=1024``).

    Raises:
        RuntimeError: If all five attempts fail; the last error is chained
            as the cause.
    """
    # The request payload does not change between attempts, so build it once.
    base64_data = self.image_to_base64(image_file)
    text = text[:4096]
    max_attempts = 5
    last_error = None
    for _ in range(max_attempts):
        try:
            resp = self.doubao_client.multimodal_embeddings.create(
                model="doubao-embedding-vision-250615",
                encoding_format="float",
                dimensions=1024,
                input=[{"text": text, "type": "text"}, {"image_url": {"url": base64_data}, "type": "image_url"}]
            )
            return resp.data['embedding']
        except Exception as e:
            last_error = e
    # Previously this fell through to ``return image_embedding`` with the
    # name unbound, hiding the real failure behind an UnboundLocalError.
    raise RuntimeError(
        f"doubao_image_embedding failed after {max_attempts} attempts for {image_file}"
    ) from last_error
def doubao_text_embedding(self, text):
    """Embed plain text with the Doubao multimodal embedding model.

    Args:
        text: Text to embed; only the first 4096 characters are sent.

    Returns:
        The value stored under ``'embedding'`` in the response data
        (requested with ``dimensions=1024``).
    """
    truncated = text[:4096]
    embeddings_api = self.doubao_client.multimodal_embeddings
    response = embeddings_api.create(
        model="doubao-embedding-vision-250615",
        encoding_format="float",
        dimensions=1024,
        input=[{"text": truncated, "type": "text"}],
    )
    return response.data['embedding']
def read_excel(self, excel_file, sheet_name="题库数据"):
    """Read one worksheet into a list of row dicts.

    Args:
        excel_file: Path of the workbook.
        sheet_name: Worksheet to read.

    Returns:
        One dict per row, with empty cells replaced by ``""``.
    """
    data = pd.read_excel(excel_file, sheet_name)
    # fillna returns a new frame; without rebinding, NaN cells would leak
    # into the rows (e.g. as a float where OCR text is expected).
    data = data.fillna("")
    data = data.to_dict(orient='records')
    print(f"成功读入excel数据:{excel_file};len(data):{len(data)}")
    return data
def read_excel_dir_anno_data(self, es_data_path, anno_data_path, save_phone_image_path):
    """Join the annotated phone photos with the rows that were loaded into ES.

    Args:
        es_data_path: Folder of the workbooks inserted into ES; the book id
            is the text before the first ``-`` of each file name.
        anno_data_path: Folder of the manually annotated workbooks whose
            ``题目图片地址`` column holds the phone photo URL.
        save_phone_image_path: Folder for the downloaded phone photos,
            stored as ``<电子页码>-<题目id>.jpg``.

    Returns:
        One dict per annotated question found in the ES data, with the keys
        ``book_id``, ``电子页码``, ``题目id``, ``题目图片地址`` (reference
        image) and ``题目图片地址phone`` (phone photo).
    """
    sheet_name = '题库数据'

    # 1. Index every row that was written to ES by its question id.
    bank_id2es_data = {}
    es_excel_names = [name for name in os.listdir(es_data_path) if name.endswith('.xlsx')]
    for excel_name in es_excel_names:
        book_id = excel_name.split("-")[0]
        excel_file = os.path.join(es_data_path, excel_name)
        # fillna returns a new frame; the result has to be kept.
        data = pd.read_excel(excel_file, sheet_name).fillna("")
        for line in data.to_dict(orient='records'):
            line['book_id'] = str(book_id)
            bank_id = str(line['题目id'])
            # Drop the bulky columns that are not needed for the evaluation.
            line.pop('page_image_embedding', "")
            line.pop('question_image_embedding', "")
            line.pop('序号', "")
            bank_id2es_data[bank_id] = line
    print("读入所有插入es库的数据 success")

    # 2. Walk the annotated questions and download photos not on disk yet.
    save_data = []
    anno_excel_names = [name for name in os.listdir(anno_data_path) if name.endswith('.xlsx')]
    image_name_all = {name for name in os.listdir(save_phone_image_path) if name.endswith('.jpg')}
    for excel_name in anno_excel_names:
        excel_file = os.path.join(anno_data_path, excel_name)
        data = pd.read_excel(excel_file, sheet_name).fillna("")
        for line in data.to_dict(orient='records'):
            bank_id = str(line['题目id'])
            phone_image_url = line['题目图片地址']
            es_line = bank_id2es_data.get(bank_id)
            if es_line is None:
                print(f"excel_name:{excel_name}; bank_id: {bank_id} 不在es库中")
                continue
            page_id = es_line['电子页码']
            book_id = es_line['book_id']
            image_url = es_line['题目图片地址']
            image_name = f"{page_id}-{bank_id}.jpg"
            if image_name not in image_name_all:
                phone_image_file = os.path.join(save_phone_image_path, image_name)
                self.download_image(phone_image_url, phone_image_file)
            save_data.append({"book_id": book_id, "电子页码": page_id, "题目id": bank_id, "题目图片地址": image_url, "题目图片地址phone": phone_image_url})
    print(f"人工标注了{len(save_data)}张手机拍摄的题目图片数据。")
    return save_data
def save_excel(self, data_list_list, excel_file):
    """Write several record lists into one workbook, one sheet per list.

    Args:
        data_list_list: Sequence of record lists; the list at position ``i``
            is written to the sheet named ``sheet<i>`` without the index
            column.
        excel_file: Path of the workbook to create.
    """
    with pd.ExcelWriter(excel_file) as writer:
        for sheet_no, records in enumerate(data_list_list):
            frame = pd.DataFrame(records)
            frame.to_excel(writer, sheet_name=f"sheet{sheet_no}", index=False)
    print("保存数据成功:", excel_file)
def paddle_ocr(self, file):
    """Run PaddleOCR on an image.

    Args:
        file: Image to open; it is converted to RGB before inference.

    Returns:
        A ``(json_string, text)`` tuple: the JSON string holds the two
        values returned by ``infer_paddle_ocr`` under ``db_boxes`` and
        ``rec_texts``; ``text`` is ``rec_texts`` joined with single spaces.
    """
    # Example input:
    # "/home/liuxin/work/search_book/data/12664728-课计划·七年级英语·RJ·上册/1.jpg"
    rgb_image = Image.open(file).convert('RGB')
    boxes, texts = infer_paddle_ocr(rgb_image, self.client_ocr)
    ocr_json = json.dumps({"db_boxes": boxes, "rec_texts": texts}, ensure_ascii=False)
    return ocr_json, " ".join(texts)
def es_search_page(self, book_id, image_ocr):
    """Search ES for the stored page that matches a photographed page.

    Args:
        book_id: Book to search in; sent as a string.
        image_ocr: OCR text of the photographed page.

    Returns:
        ``(source_image_url, score)`` of the first hit (``top_k`` is 1).

    Raises:
        requests.RequestException: On connection errors, a timeout (60 s) or
            a non-2xx status code.
        KeyError, IndexError: If the response contains no hit.
    """
    # Local deployment: 'http://localhost:31001/search_page'
    url = 'http://61.170.32.8:31001/search_page'
    headers = {
        'accept': 'application/json',
    }
    payload = {
        "book_id": str(book_id),
        "image_ocr": image_ocr,
        "top_k": 1
    }
    # Send bytes: the OCR text is mostly non-ASCII, and how a ``str`` body is
    # encoded is left to the HTTP stack.
    body = json.dumps(payload, ensure_ascii=False).encode("utf-8")
    # requests never times out by default; bound the wait explicitly.
    response = requests.post(url, headers=headers, data=body, timeout=60)
    response.raise_for_status()
    top_hit = response.json()['es_search'][0]
    return top_hit['_source']['source_image_url'], top_hit['_score']
def es_search_question(self, book_id, question_ocr, image_ocr=""):
    """Search ES for a question by its OCR text.

    Args:
        book_id: Book to search in; sent as a string.
        question_ocr: OCR text of the photographed question.
        image_ocr: Optional OCR text of the whole page.

    Returns:
        ``(bankId, score)`` of the first hit (``top_k`` is 1).

    Raises:
        requests.RequestException: On connection errors, a timeout (60 s) or
            a non-2xx status code.
        KeyError, IndexError: If the response contains no hit.
    """
    # Local deployment: 'http://localhost:31001/search_question_text'
    url = 'http://61.170.32.8:31001/search_question_text'
    headers = {
        'accept': 'application/json',
    }
    payload = {
        "book_id": str(book_id),
        "image_ocr": image_ocr,
        "question_ocr": question_ocr,
        "top_k": 1
    }
    # Send bytes: the OCR text is mostly non-ASCII, and how a ``str`` body is
    # encoded is left to the HTTP stack.
    body = json.dumps(payload, ensure_ascii=False).encode("utf-8")
    # requests never times out by default; bound the wait explicitly.
    response = requests.post(url, headers=headers, data=body, timeout=60)
    response.raise_for_status()
    top_hit = response.json()['es_search'][0]
    return top_hit['_source']['bankId'], top_hit['_score']
def es_search_question_embedding(self, book_id, question_embedding):
    """Search ES for a question using only its image+OCR embedding.

    Args:
        book_id: Book to search in; sent as a string.
        question_embedding: Embedding vector; sent as a JSON-encoded string
            inside the JSON body.

    Returns:
        ``(bankId, score)`` of the first hit (``top_k`` is 1).

    Raises:
        requests.RequestException: On connection errors, a timeout (60 s) or
            a non-2xx status code.
        KeyError, IndexError: If the response contains no hit.
    """
    url = 'http://localhost:31001/search_question_embedding'
    # Remote deployment: 'http://61.170.32.8:31001/search_question_embedding'
    headers = {
        'accept': 'application/json',
    }
    payload = {
        "book_id": str(book_id),
        "question_embedding": json.dumps(question_embedding),
        "top_k": 1
    }
    # Send bytes so the body encoding does not depend on the HTTP stack.
    body = json.dumps(payload, ensure_ascii=False).encode("utf-8")
    # requests never times out by default; bound the wait explicitly.
    response = requests.post(url, headers=headers, data=body, timeout=60)
    response.raise_for_status()
    top_hit = response.json()['es_search'][0]
    return top_hit['_source']['bankId'], top_hit['_score']
def search_question_text_and_embedding(self, book_id, question_ocr, question_embedding):
    """Search ES for a question using OCR text and embedding together.

    The service combines the text-search score with the vector-search score.

    Args:
        book_id: Book to search in; sent as a string.
        question_ocr: OCR text of the photographed question.
        question_embedding: Embedding vector; sent as a JSON-encoded string
            inside the JSON body.

    Returns:
        ``(bankId, score)`` of the first hit (``top_k`` is 1).

    Raises:
        requests.RequestException: On connection errors, a timeout (60 s) or
            a non-2xx status code.
        KeyError, IndexError: If the response contains no hit.
    """
    url = 'http://localhost:31001/search_question_text_and_embedding'
    # Remote deployment: 'http://61.170.32.8:31001/search_question_text_and_embedding'
    headers = {
        'accept': 'application/json',
    }
    payload = {
        "book_id": str(book_id),
        "question_ocr": question_ocr,
        "question_embedding": json.dumps(question_embedding),
        "top_k": 1
    }
    # Send bytes: the OCR text is mostly non-ASCII, and how a ``str`` body is
    # encoded is left to the HTTP stack.
    body = json.dumps(payload, ensure_ascii=False).encode("utf-8")
    # requests never times out by default; bound the wait explicitly.
    response = requests.post(url, headers=headers, data=body, timeout=60)
    response.raise_for_status()
    top_hit = response.json()['es_search'][0]
    return top_hit['_source']['bankId'], top_hit['_score']
def es_search_page_list(self, book_id, data_list, page_image_path):
    """Evaluate whole-page search against the phone photos of each page.

    Every distinct ``电子页码`` is OCR'd from ``<page_image_path>/<page_id>.jpg``
    (Hehe OCR via ``recognize_pdf2md``) and searched once; rows on the same
    page reuse that result. A row counts as correct when the returned URL
    equals its ``source_image_url``.

    Args:
        book_id: Default book id; a row's own ``book_id`` takes precedence.
        data_list: Row dicts with ``电子页码`` and ``source_image_url``.
        page_image_path: Folder with the photographed pages.

    Returns:
        Deep copies of the rows, extended with ``search_page_ocr``,
        ``search_page_url``, ``search_page_score`` and ``flag`` (1 or 0).
    """
    save_data = []
    true_count = 0   # rows whose page was found
    false_count = 0  # rows whose page was missed
    score_all = 0
    page_id2item = {}  # page id -> OCR text, hit URL and score
    for line in data_list:
        page_id = line['电子页码']
        line_temp = copy.deepcopy(line)
        if page_id not in page_id2item:
            image_file = os.path.join(page_image_path, str(page_id) + ".jpg")
            search_page_ocr = json.loads(recognize_pdf2md(image_file))['result']['markdown']
            if 'book_id' in line:
                book_id = line['book_id']
            search_page_url, score = self.es_search_page(book_id, search_page_ocr)
            score_all += score
            page_id2item[page_id] = {"search_page_ocr": search_page_ocr, "search_page_url": search_page_url, "search_page_score": score}
        item = page_id2item[page_id]
        line_temp["search_page_ocr"] = item['search_page_ocr']
        line_temp["search_page_url"] = item['search_page_url']
        line_temp["search_page_score"] = item['search_page_score']
        if item['search_page_url'] == line_temp['source_image_url']:
            flag = 1
            true_count += 1
        else:
            flag = 0
            false_count += 1
        line_temp["flag"] = flag
        save_data.append(line_temp)
    # Scores are summed once per searched page, so average over pages (not
    # rows) and avoid dividing by zero when there is no input.
    mean_score = score_all / len(page_id2item) if page_id2item else 0
    print(f"图书 book_id:{book_id},搜整图正确{true_count}张图;错误{false_count}张图。es平均得分{mean_score}")
    return save_data
def es_search_page_question_list(self, book_id, data_list, question_image_path, image_path):  # deprecated
    """Evaluate question search that uses question OCR plus page OCR.

    Marked as discarded by the original author. Both images are read with
    ``self.paddle_ocr``; page OCR is computed once per ``电子页码``.

    Args:
        book_id: Default book id; a row's own ``book_id`` takes precedence.
        data_list: Row dicts with ``电子页码`` and ``题目id``.
        question_image_path: Folder with photographed questions, named
            ``<电子页码>-<题目id>.jpg``.
        image_path: Folder with photographed pages, named ``<电子页码>.jpg``.

    Returns:
        ``data_list`` itself, each row extended in place with
        ``search_question_ocr``, ``search_page_ocr``,
        ``question_page_search_bankId``, ``question_page_search_score`` and
        ``question_page_flag``. A failed search stores ``"error"`` for id
        and score and counts as wrong.
    """
    true_count = 0   # questions found
    false_count = 0  # questions missed
    score_all = 0
    page_id2ocr = {}
    for line in data_list:
        page_id = line['电子页码']
        bank_id = line['题目id']
        if page_id not in page_id2ocr:
            image_file = os.path.join(image_path, str(page_id) + ".jpg")
            _, page_id2ocr[page_id] = self.paddle_ocr(image_file)
        question_image_file = os.path.join(question_image_path, f"{page_id}-{bank_id}.jpg")
        _, question_ocr = self.paddle_ocr(question_image_file)
        line["search_question_ocr"] = question_ocr
        line["search_page_ocr"] = page_id2ocr[page_id]
        try:
            if 'book_id' in line:
                book_id = line['book_id']
            search_bankId, search_score = self.es_search_question(book_id, question_ocr, page_id2ocr[page_id])
            score_all += search_score
        except Exception as e:
            print(f"es_search_page_question_list error {e}")
            print(question_image_file)
            search_bankId = "error"
            search_score = "error"
        line['question_page_search_bankId'] = search_bankId
        line['question_page_search_score'] = search_score
        if str(search_bankId) == str(bank_id):
            flag = 1
            true_count += 1
        else:
            flag = 0
            false_count += 1
        line['question_page_flag'] = flag
        total = true_count + false_count
        if total % 100 == 0 and total != 0:
            print(f"输入整图和题目OCR,搜整题正确{true_count}张图;错误{false_count}张图。正确率:{true_count / (true_count + false_count)}")
    # Guard the empty case instead of raising ZeroDivisionError.
    mean_score = score_all / len(data_list) if data_list else 0
    print(f"图书 book_id:{book_id},输入整图和题目OCR,搜整题正确{true_count}张图;错误{false_count}张图。es平均得分:{mean_score}")
    return data_list
def es_search_question_list(self, book_id, data_list, question_image_path):
    """Evaluate question search that uses only the question's OCR text.

    Each photo ``<question_image_path>/<电子页码>-<题目id>.jpg`` is read with
    Hehe OCR (``recognize_pdf2md``) and searched in ES.

    Args:
        book_id: Default book id; a row's own ``book_id`` takes precedence.
        data_list: Row dicts with ``电子页码`` and ``题目id``.
        question_image_path: Folder with the photographed questions.

    Returns:
        ``data_list`` itself, each row extended in place with
        ``question_search_bankId``, ``search_question_ocr``,
        ``search_question_score`` and ``question_flag``. A failed search
        stores ``"error"`` / ``-1`` and counts as wrong.
    """
    true_count = 0   # questions found
    false_count = 0  # questions missed
    for line in data_list:
        page_id = line['电子页码']
        bank_id = line['题目id']
        question_image_file = os.path.join(question_image_path, f"{page_id}-{bank_id}.jpg")
        question_ocr = json.loads(recognize_pdf2md(question_image_file))['result']['markdown']
        try:
            if 'book_id' in line:
                book_id = line['book_id']
            search_bankId, score = self.es_search_question(book_id, question_ocr)
        except Exception as e:
            search_bankId, score = "error", -1
            print(f"es_search_question_list error :{e}")
            print(question_image_file)
        line['question_search_bankId'] = search_bankId
        line["search_question_ocr"] = question_ocr
        line["search_question_score"] = score
        if str(search_bankId) == str(bank_id):
            flag = 1
            true_count += 1
        else:
            flag = 0
            false_count += 1
        line['question_flag'] = flag
        total = true_count + false_count
        if total % 100 == 0 and total != 0:
            print(f"只输入题目OCR,搜整题正确{true_count}张图;错误{false_count}张图。正确率:{true_count / (true_count + false_count)}")
    # Guard the empty case instead of raising ZeroDivisionError.
    total = true_count + false_count
    accuracy = true_count / total if total else 0.0
    print(f"图书 book_id:{book_id},只输入题目OCR,搜整题正确{true_count}张图;错误{false_count}张图。正确率:{accuracy}")
    return data_list
def es_search_question_embedding_list(self, book_id, data_list, question_image_path):
    """Evaluate question search that uses only the image+OCR embedding.

    Rows must already carry ``search_question_ocr``. The photo
    ``<question_image_path>/<电子页码>-<题目id>.jpg`` and that text are
    embedded together and the vector is searched in ES.

    Args:
        book_id: Default book id; a row's own ``book_id`` takes precedence.
        data_list: Row dicts with ``电子页码``, ``题目id`` and
            ``search_question_ocr``.
        question_image_path: Folder with the photographed questions.

    Returns:
        ``data_list`` itself, each row extended in place with
        ``question_search_embedding_bankId``,
        ``search_question_embedding_score`` and ``question_embedding_flag``.
        A failed search stores ``"error"`` for both and counts as wrong.
    """
    true_count = 0   # questions found
    false_count = 0  # questions missed
    for line in data_list:
        page_id = line['电子页码']
        bank_id = line['题目id']
        search_question_ocr = line["search_question_ocr"]
        question_image_file = os.path.join(question_image_path, f"{page_id}-{bank_id}.jpg")
        question_embedding = self.doubao_image_embedding(question_image_file, search_question_ocr)
        try:
            if 'book_id' in line:
                book_id = line['book_id']
            search_bankId, score = self.es_search_question_embedding(book_id, question_embedding)
        except Exception as e:
            search_bankId = "error"
            score = "error"
            print(f"es_search_question_embedding_list error {e}")
            print(question_image_file)
        line['question_search_embedding_bankId'] = search_bankId
        line["search_question_embedding_score"] = score
        if str(search_bankId) == str(bank_id):
            flag = 1
            true_count += 1
        else:
            flag = 0
            false_count += 1
        line['question_embedding_flag'] = flag
        total = true_count + false_count
        if total % 100 == 0 and total != 0:
            print(f"只输入题目embedding,搜整题正确{true_count}张图;错误{false_count}张图。正确率:{true_count / (true_count + false_count)}")
    # Guard the empty case instead of raising ZeroDivisionError.
    total = true_count + false_count
    accuracy = true_count / total if total else 0.0
    print(f"图书 book_id:{book_id},只输入题目embedding,搜整题正确{true_count}张图;错误{false_count}张图。正确率:{accuracy}")
    return data_list
def es_search_question_text_and_embedding_list(self, book_id, data_list, question_image_path):
    """Evaluate question search that combines OCR text and embedding.

    Rows must already carry ``search_question_ocr``. The photo
    ``<question_image_path>/<电子页码>-<题目id>.jpg`` and that text are
    embedded together, then text and vector are sent to the joint search.

    Args:
        book_id: Default book id; a row's own ``book_id`` takes precedence.
        data_list: Row dicts with ``电子页码``, ``题目id`` and
            ``search_question_ocr``.
        question_image_path: Folder with the photographed questions.

    Returns:
        ``data_list`` itself, each row extended in place with
        ``question_search_embedding_and_text_bankId``,
        ``search_question_embedding_and_text_score`` and
        ``question_embedding_and_text_flag``. A failed search stores
        ``'error'`` for both and counts as wrong.
    """
    true_count = 0   # questions found
    false_count = 0  # questions missed
    for line in data_list:
        page_id = line['电子页码']
        bank_id = line['题目id']
        search_question_ocr = line["search_question_ocr"]
        question_image_file = os.path.join(question_image_path, f"{page_id}-{bank_id}.jpg")
        question_embedding = self.doubao_image_embedding(question_image_file, search_question_ocr)
        try:
            if 'book_id' in line:
                book_id = line['book_id']
            search_bankId, score = self.search_question_text_and_embedding(book_id, search_question_ocr, question_embedding)
        except Exception as e:
            search_bankId, score = 'error', 'error'
            print(f"es_search_question_text_and_embedding_list error {e} ")
            print(question_image_file)
        line['question_search_embedding_and_text_bankId'] = search_bankId
        line["search_question_embedding_and_text_score"] = score
        if str(search_bankId) == str(bank_id):
            flag = 1
            true_count += 1
        else:
            flag = 0
            false_count += 1
        line['question_embedding_and_text_flag'] = flag
    # Guard the empty case instead of raising ZeroDivisionError.
    total = true_count + false_count
    accuracy = true_count / total if total else 0.0
    print(f"图书 book_id:{book_id},题目ocr和embedding联合搜索,搜整题正确{true_count}张图;错误{false_count}张图。正确率:{accuracy}")
    return data_list
def result1(self):
    """Evaluate the three question-search variants on book 12667382.

    Runs text search, embedding search and the combined search in that
    order over the same rows and saves the result as
    ``<book_id>-测试<YYYYMMDD>.xlsx`` in the reference-image folder. The
    whole-page steps are currently disabled.
    """
    book_id = 12667382
    question_image_path = "/home/liuxin/work/search_question/data/12667382-25春北师一数暑假作业-题目"  # phone photos of questions
    page_image_path = "/home/liuxin/work/search_question/data/12667382-25春北师一数暑假作业"  # phone photos of pages (only used by the disabled steps)
    source_image_path = "/home/liuxin/work/search_question/data/12667382-25春北师一数暑假作业-原图"  # reference images
    excel_file = os.path.join(source_image_path, "12667382-25春北师一数暑假作业.xlsx")
    rows = self.read_excel(excel_file, sheet_name="题库数据")
    # Disabled: page_data_list = self.es_search_page_list(book_id, data_list, page_image_path)
    # Disabled: self.es_search_page_question_list(book_id, data_list, question_image_path, page_image_path)
    search_steps = (
        self.es_search_question_list,                     # ES text search
        self.es_search_question_embedding_list,           # ES vector search
        self.es_search_question_text_and_embedding_list,  # combined search
    )
    for step in search_steps:
        rows = step(book_id, rows, question_image_path)
    date_tag = datetime.now().strftime("%Y%m%d")
    save_file = os.path.join(source_image_path, f"{book_id}-测试{date_tag}.xlsx")
    self.save_excel([rows], save_file)
def result2(self):
    """Evaluate whole-page search only, for one hard-coded book.

    Saves the result as ``<book_id>-测试整页<YYYYMMDD>.xlsx`` in the
    reference-image folder.
    """
    # Alternative books, kept for switching the evaluation target:
    # book_id = 12664728 #
    # page_image_path = "/home/liuxin/work/search_question/data/12664728-课计划·七年级英语·RJ·上册" # phone photos of pages
    # source_image_path = "/home/liuxin/work/search_question/data/12664728-课计划·七年级英语·RJ·上册(出血文件)-原图" # reference images
    # excel_file = os.path.join(source_image_path, "12664728-课计划·七年级英语·RJ·上册(出血文件).xlsx")
    # book_id = 12670279 #
    # page_image_path = "/home/liuxin/work/search_question/data/12670279-地理" # phone photos of pages
    # source_image_path = "/home/liuxin/work/search_question/data/12670279-12670279-2025版ZT7地-下册横版《锦上添花(期末大赢家)》下册-新编文件5.8-原图" # reference images
    # excel_file = os.path.join(source_image_path, "12670279-12670279-2025版ZT7地-下册横版《锦上添花(期末大赢家)》下册-新编文件5.8.xlsx")
    book_id = 12667382
    page_image_path = "/home/liuxin/work/search_question/data/12667382-25春北师一数暑假作业"  # phone photos of pages
    source_image_path = "/home/liuxin/work/search_question/data/12667382-25春北师一数暑假作业-原图"  # reference images
    excel_file = os.path.join(source_image_path, "12667382-25春北师一数暑假作业.xlsx")
    rows = self.read_excel(excel_file, sheet_name="题库数据")
    page_rows = self.es_search_page_list(book_id, rows, page_image_path)
    date_tag = datetime.now().strftime("%Y%m%d")
    save_file = os.path.join(source_image_path, f"{book_id}-测试整页{date_tag}.xlsx")
    self.save_excel([page_rows], save_file)
def result3(self):
    """Evaluate question search over every annotated workbook in a folder.

    Runs text search and then embedding search (the combined search is
    disabled) and saves the result as ``搜题-测试<YYYYMMDD>.xlsx`` in the
    phone-photo folder.
    """
    es_data_path = "/home/liuxin/work/search_question/data/拆书平台下载excel文件_添加字段内容"  # all workbooks loaded into ES
    anno_data_path = "/home/liuxin/work/search_question/data/人工标注的手机拍题excel"  # manually annotated question photos
    save_phone_image_path = "/home/liuxin/work/search_question/data/人工标注的手机拍题图片"  # downloaded phone photos
    rows = self.read_excel_dir_anno_data(es_data_path, anno_data_path, save_phone_image_path)
    # The phone photos double as the question image folder. No default book
    # id is passed; the search steps use each row's own ``book_id``.
    question_image_path = save_phone_image_path
    book_id = None
    for step in (self.es_search_question_list, self.es_search_question_embedding_list):
        rows = step(book_id, rows, question_image_path)
    # Disabled: self.es_search_question_text_and_embedding_list(book_id, question_data_list, question_image_path)
    date_tag = datetime.now().strftime("%Y%m%d")
    save_file = os.path.join(save_phone_image_path, f"搜题-测试{date_tag}.xlsx")
    self.save_excel([rows], save_file)
# Script entry point: runs the question-search evaluation (result3).
if __name__ == "__main__":
    test = Test()
    # test.result2() # evaluate whole-page image search
    test.result3() # evaluate question search
    # # PaddleOCR smoke test
    # image_file_list =[
    # "/home/liuxin/work/search_question/data/12667382一年级数学-题目/19-764148.jpg",
    # "/home/liuxin/work/search_question/data/12667382一年级数学-题目/19-764148.jpg.jpg",
    # # "",
    # # ""
    # ]
    # for image_file in image_file_list:
    # paddle_ocr, paddle_ocr_text = test.paddle_ocr( image_file)
    # print(image_file)
    # print(paddle_ocr_text)
    print("\nfinished.")
""" es 计算的分数利用词频信息,库里的数据不同 得分不同
图书 book_id:12664728,搜整图正确301张图;错误0张图。 合合ocr
图书 book_id:12670279,搜整图正确159张图;错误0张图。 合合ocr
搜题 es库里的数据和用户手机拍摄的数据都用paddleOCR
图书 book_id:12667382,只输入题目OCR,搜整题正确138张图;错误3张图。正确率:0.9787234042553191
图书 book_id:12667382,只输入题目embedding,搜整题正确141张图;错误0张图。正确率:1.0
图书 book_id:12667382,题目ocr和embedding联合搜索,搜整题正确140张图;错误1张图。正确率:0.9929078014184397
搜题 es库里的数据和用户手机拍摄的题目图片都用paddleOCR
只输入题目OCR,搜整题正确465张图;错误11张图。正确率:0.976890756302521
只输入题目embedding,搜整题正确476张图;错误0张图。正确率:1.0 (题目embedding:将题目ocr和图片作为一个整体输入embedding模型获取向量)
搜题 es库里的题目图片和用户手机拍摄的题目图片都用合合ocr
1、只输入题目OCR,搜整题正确649张图;错误3张图。正确率:0.995。
2、只输入题目embedding,搜整题正确651张图;错误1张图。正确率:0.998。(有一条数据标错了)
"""
# nohup python -u tst_search_result.py > tst_search_result.log 2>&1 &
# tail -f tst_search_result.log
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment