Commit 36d45731 by unknown

init

parent 8fc24340
## Image-Based Book Search (搜图搜书)
#### 1. Requirements
Given a full-page photo of an exercise book taken by a user, find the matching page image in the image library.
Server: 61.170.32.8
Working directory: /home/liuxin/work/search_book
Conda environment: conda activate search_book
#### 2. Search approach
```
Preparation:
1. Split the exercise book into individual page images.
2. The backend parses the whole book, producing the OCR result for each question plus the
   paddleOCR result for each full page, collects everything into one Excel file, and uploads
   it into the ES database through the API (Excel sample: )
Pipeline:
1. The user photographs a complete book page with a phone and uploads the photo.
2. The photo is parsed with paddleOCR, and the recognized text is concatenated directly into
   one long string.
3. The search API receives that long string and uses it to query ES.
4. The top-1 ES hit is returned.
```
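Concretely, pipeline step 2 is just OCR followed by string concatenation. A minimal sketch, assuming the Triton OCR client from paddle_ocr.py below and a hypothetical input photo `user_page_photo.jpg`:
```
# Sketch of pipeline step 2: OCR the user photo, then join the recognized
# lines into one long query string (the input file name is hypothetical).
from PIL import Image
from tritonclient import grpc as triton_grpc
from paddle_ocr import infer_paddle_ocr

client = triton_grpc.InferenceServerClient('61.170.32.8:38896')  # the Triton OCR endpoint used elsewhere in this repo
im = Image.open("user_page_photo.jpg").convert('RGB')
db_boxes, rec_texts = infer_paddle_ocr(im, client)
image_ocr = " ".join(rec_texts)  # the long string sent to the search API
```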
#### 3. Interfaces
```
# 1. Upload an Excel file into the ES database
import requests

def insert_excel(data):
    # url = 'http://localhost:31000/upload/excel'
    url = 'http://61.170.32.8:31000/upload/excel'
    headers = {'accept': 'application/json'}
    file_path = data['file_path']
    # Upload the file; book_id and book_name travel as form fields alongside it
    with open(file_path, 'rb') as file:
        files = {
            'file': (file_path, file),
        }
        # Send the request
        response = requests.post(url, headers=headers, data=data, files=files)
    # Print the response
    print(f"Status code: {response.status_code}")
    print(f"Response body: {response.json()}")

data = {'book_id': '12663121', 'book_name': '衔接教材地理', "file_path": '12663121-衔接教材 地理.pdf.xlsx'}
insert_excel(data)
```
```
# 2. Delete all data for a book from the ES database by book_id
import json
import requests

def delete_es_book(book_id="12663121"):
    # url = 'http://localhost:31000/delete_book'
    url = 'http://61.170.32.8:31000/delete_book'
    headers = {
        'accept': 'application/json',
    }
    # JSON body
    data = {
        "book_id": book_id
    }
    response = requests.post(url, headers=headers, json=data)
    # Print the response
    print(f"Status code: {response.status_code}")
    res = response.json()
    print(json.dumps(res, ensure_ascii=False, indent=4))

book_id = "12667382"
delete_es_book(book_id)
```
```
# 3. Search a book page by image OCR text
import json
import requests

def search_book_question(book_id, image_ocr):
    # url = 'http://localhost:31000/search_book_quesiton'
    url = 'http://61.170.32.8:31000/search_book_quesiton'  # the endpoint keeps this spelling in the service code
    headers = {
        'accept': 'application/json',
    }
    # JSON body
    data = {
        "book_id": book_id,
        "image_ocr": image_ocr,
        "top_k": 1
    }
    response = requests.post(url, headers=headers, json=data)
    # Print the response
    print(f"Status code: {response.status_code}")
    res = response.json()
    bankId = res['es_search'][0]['_source']['questions'][0]['bankId']
    print("bankId: ", bankId)
    return json.dumps(res, ensure_ascii=False, indent=4)

book_id = "12664728"
image_ocr = "OCR text of the user's full-page photo"
res = search_book_question(book_id, image_ocr)
print(res)
```
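For orientation, a successful response from this endpoint has roughly the following shape; the field names come from the per-page documents built in prepare_data.py below, but the values here are illustrative only:
```
# Illustrative response shape only; the values are made up
{
    "status": "success",
    "es_search": [
        {
            "_id": "...",
            "_score": 42.0,
            "_source": {
                "book_id_page_id": "12664728-3",
                "book_id": "12664728",
                "book_name": "...",
                "image_ocr_clean": "...",
                "page_ocr": "...",
                "questions": [{"bankId": 760261, "page_id": 3}]
            }
        }
    ]
}
```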
#### 4. Starting the service
```
# On the 61.170.32.8 server
$ cd /home/liuxin/work/search_book
$ conda activate search_book
$ nohup python -u api_service.py > api_service.log 2>&1 &
$ tail -f log/search_book.log
```
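Once the process is up, a quick sanity check from any machine (FastAPI serves its interactive docs at /docs by default):
```
# Verify the service is reachable; expect HTTP 200
import requests

r = requests.get('http://61.170.32.8:31000/docs', timeout=5)
print(r.status_code)
```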
#### 5. api_service.py

The FastAPI service behind the three endpoints above:

```
from fastapi import FastAPI, File, UploadFile, Form, HTTPException
import os
from typing import Optional
from pathlib import Path
from pydantic import BaseModel
import uvicorn
import logging
from save_es_database import EsHelper
from prepare_data import prepareData

__path__ = os.path.dirname(os.path.abspath(__file__))
pre_data = prepareData()  # prepares the book data that gets written to ES
app = FastAPI(title="搜书搜题")
es = EsHelper()
index_name = "search_book_and_question"  # rays book/question search index

# Upload folder and allowed file types
UPLOAD_DIR = Path("uploads")
ALLOWED_EXTENSIONS = {"xlsx"}
print("UPLOAD_DIR: ", UPLOAD_DIR)
# Make sure the upload folder exists
UPLOAD_DIR.mkdir(exist_ok=True)

# Response model
class UploadResponse(BaseModel):
    status: str
    message: str
    filename: str
    file_path: str
    book_id: str
    book_name: str

# Request body of the search endpoint
class SearchBook(BaseModel):
    book_id: str
    image_ocr: str
    top_k: int = 1

class DeleteBook(BaseModel):
    book_id: str

# Check whether the file extension is allowed
def allowed_file(filename: str) -> bool:
    return "." in filename and filename.rsplit(".", 1)[1].lower() in ALLOWED_EXTENSIONS

def setup_logger():
    # Create the logger (a singleton, unique per name)
    logger = logging.getLogger('search_book')
    # Avoid double initialization: if handlers already exist, return as-is
    if logger.handlers:
        return logger
    logger.setLevel(logging.INFO)
    # Do not propagate to the parent logger
    logger.propagate = False
    # Log directory (the script's own directory, instead of __path__)
    current_dir = os.path.dirname(os.path.abspath(__file__))
    log_dir = os.path.join(current_dir, "log")
    if not os.path.exists(log_dir):
        os.makedirs(log_dir, exist_ok=True)  # exist_ok avoids races across threads
    log_file = os.path.join(log_dir, "search_book.log")
    # File handler (opens the file lazily)
    file_handler = logging.FileHandler(log_file, encoding='utf-8', delay=True)
    file_handler.setLevel(logging.INFO)
    # Log format
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    file_handler.setFormatter(formatter)
    # Attach the handler
    logger.addHandler(file_handler)
    return logger

# Global logger instance
logger_es = setup_logger()

# 1. File upload API (receives the Excel file produced by splitting a book)
@app.post("/upload/excel", response_model=UploadResponse)
async def upload_excel(
    book_id: str = Form(...),    # required
    book_name: str = Form(...),  # required
    file: UploadFile = File(...)
):
    # Check the file type
    if not allowed_file(file.filename):
        logger_es.info(f"/upload/excel : file type not allowed. Allowed types: {', '.join(ALLOWED_EXTENSIONS)}")
        raise HTTPException(
            status_code=400,
            detail=f"File type not allowed. Allowed types: {', '.join(ALLOWED_EXTENSIONS)}"
        )
    # 1. Build a safe file name
    file_ext = os.path.splitext(file.filename)[1]
    safe_filename = f"{book_id}_{book_name}{file_ext}"
    file_path = UPLOAD_DIR / safe_filename
    # 2. Save the Excel file
    try:
        with open(file_path, "wb") as f:
            contents = await file.read()
            f.write(contents)
    except Exception as e:
        logger_es.info(f"/upload/excel : error while saving the Excel file: {str(e)}")
        raise HTTPException(status_code=500, detail=f"Error while saving the Excel file: {str(e)}")
    finally:
        await file.close()
    # 3. Save the cleaned Excel data to a JSON file
    try:
        sheet_name = "题库数据"
        data = pre_data.read_excel(file_path, sheet_name)
        flag, data = pre_data.prepare_data(data, book_id, book_name)
        if not flag:
            logger_es.info(f"/upload/excel : the Excel question-text (题目文本) column is empty after cleaning: {str(data)}")
            raise HTTPException(status_code=510, detail=f"The Excel question-text (题目文本) column is empty after cleaning: {str(data)}")  # custom status code
        safe_filename_json = f"{book_id}_{book_name}.json"
        file_path_json = UPLOAD_DIR / safe_filename_json
        pre_data.write2json(file_path_json, data)  # save the Excel data as JSON
    except HTTPException:
        raise  # let the 510 error above propagate unchanged
    except Exception as e:
        logger_es.info(f"/upload/excel : error while cleaning the Excel data: {str(e)}")
        raise HTTPException(status_code=520, detail=f"Error while cleaning the Excel data: {str(e)}")  # custom status code
    # 4. Insert the book data into the ES database
    try:
        es.insert_doc_batch(index_name, data)
    except Exception as e:
        logger_es.info(f"/upload/excel : error while inserting into ES: {str(e)}")
        raise HTTPException(status_code=530, detail=f"Error while inserting into ES: {str(e)}")  # custom status code
    logger_es.info(f"/upload/excel : ES insert success: book_id:{str(book_id)};book_name:{book_name};file: {safe_filename}")
    return {
        "status": "success",
        "message": "File inserted into ES successfully",
        "filename": safe_filename,
        "file_path": str(file_path),
        "book_id": book_id,
        "book_name": book_name
    }

# 2. Delete the data for a book_id from the ES database
@app.post("/delete_book")
async def delete_book(input: DeleteBook):
    book_id = input.book_id
    logger_es.info(f"delete_book : {input}")
    try:
        es.delete_book_id(index_name, book_id)
    except Exception as e:
        logger_es.info(f"delete_book : error deleting data for book_id:{book_id}: {e}")
        raise HTTPException(status_code=500, detail=f"Error deleting data for book_id:{book_id}: {e}")
    return {"status": "success", "msg": f"Successfully deleted the data for book_id:{book_id}."}

# 3. Search a book page by OCR text
@app.post("/search_book_quesiton")
async def search_book_quesiton(input: SearchBook):
    logger_es.info(f"search_book_quesiton : {input}")
    book_id = input.book_id
    image_ocr = input.image_ocr
    top_k = input.top_k
    try:
        res = await es.search_question(index_name, book_id, image_ocr, top_k)
    except Exception as e:
        logger_es.info(f"search_book_quesiton : error searching book_id:{book_id}, image_ocr:{image_ocr}: {e}")
        raise HTTPException(status_code=500, detail=f"Error searching book_id:{book_id}, image_ocr:{image_ocr}: {e}")
    return {"status": "success", "es_search": res}

if __name__ == "__main__":
    uvicorn.run(app="api_service:app", host="0.0.0.0", port=31000, workers=1)  # the deployed service listens on port 31000
    # nohup python -u api_service.py > api_service.log 2>&1 &
    # tail -f log/search_book.log
```
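Note that EsHelper is imported from save_es_database, which is not included in this excerpt, so insert_doc_batch, delete_book_id, and search_question are used above as black boxes. As a rough, hypothetical sketch of what search_question might look like, assuming book_id is indexed as a keyword and the per-page OCR fields built in prepare_data.py (the real implementation may differ):
```
# Hypothetical sketch of EsHelper.search_question -- NOT the actual save_es_database code.
from elasticsearch import Elasticsearch

class EsHelper:
    def __init__(self):
        # Host taken from the ES demo below; adjust as needed
        self.es = Elasticsearch(hosts=["http://192.168.8.224:19200"])

    async def search_question(self, index_name, book_id, image_ocr, top_k=1):
        # Restrict to one book, then rank its pages by text similarity of the OCR fields
        query = {
            "bool": {
                "filter": [{"term": {"book_id": book_id}}],
                "should": [
                    {"match": {"image_ocr_clean": image_ocr}},
                    {"match": {"page_ocr": image_ocr}}
                ],
                "minimum_should_match": 1
            }
        }
        results = self.es.search(index=index_name, query=query, size=top_k)
        return results['hits']['hits']
```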
#### 6. api_tst.py

Client helpers and ad-hoc tests for the endpoints above:

```
import requests
import re, json, os, sys

def get_page_image_url(bankId="760261"):
    # Given a question's bankId, return the URL of the full page that contains it
    url = "https://rays7.5rs.me/matrix/v1.0/aIRecognized/getPageInfoByBankId"
    headers = {
        "token": "whlg2025!",
        "Only-For-Backend": "y"
    }
    data = {
        "bankId": bankId
    }
    response = requests.get(url, headers=headers, params=data)
    sourceImageUrl = response.json()['data']['sourceImageUrl']
    return sourceImageUrl
# get_page_image_url()

def insert_excel(data):
    # 1. Upload an Excel file into the ES database
    url = 'http://localhost:31000/upload/excel'
    # url = 'http://61.170.32.8:31000/upload/excel'
    headers = {'accept': 'application/json'}
    file_path = data['file_path']
    # Upload the file
    with open(file_path, 'rb') as file:
        files = {
            'file': (file_path, file),
        }
        # Send the request
        response = requests.post(url, headers=headers, data=data, files=files)
    # Print the response
    print(f"Status code: {response.status_code}")
    print(f"Response body: {response.json()}")

def delete_es_book(book_id="12663121"):
    # 2. Delete the book data for a book_id from ES
    url = 'http://localhost:31000/delete_book'
    # url = 'http://61.170.32.8:31000/delete_book'
    headers = {
        'accept': 'application/json',
    }
    # JSON body
    data = {
        "book_id": book_id
    }
    response = requests.post(url, headers=headers, json=data)
    # Print the response
    print(f"Status code: {response.status_code}")
    res = response.json()
    print(json.dumps(res, ensure_ascii=False, indent=4))

def search_book_question(book_id, image_ocr):
    # 3. Search a book page by OCR text
    # url = 'http://localhost:31000/search_book_quesiton'
    url = 'http://61.170.32.8:31000/search_book_quesiton'
    headers = {
        'accept': 'application/json',
    }
    # JSON body
    data = {
        "book_id": book_id,
        "image_ocr": image_ocr,
        "top_k": 1
    }
    response = requests.post(url, headers=headers, json=data)
    # Print the response
    print(f"Status code: {response.status_code}")
    res = response.json()
    bankId = res['es_search'][0]['_source']['questions'][0]['bankId']
    print("bankId: ", bankId)
    return json.dumps(res, ensure_ascii=False, indent=4)

if __name__ == "__main__":
    # 1. Insert Excel data into ES
    # data = {'book_id': '12663121', 'book_name': '衔接教材地理', "file_path": '/home/liuxin/work/search_book/data/12663121-衔接教材 地理.pdf.xlsx'}  # 64
    data = {'book_id': '12667382', 'book_name': '25春北师一数暑假作业', "file_path": '/home/liuxin/work/search_book/data/12667382一年级数学-原图/12667382-25春北师一数暑假作业.pdf_paddleOCR.xlsx'}  # 44
    # data = {'book_id': '12664728', 'book_name': '课计划·七年级英语·RJ·上册(出血文件)', "file_path": '/home/liuxin/work/search_book/data/12664728-课计划·七年级英语·RJ·上册-原图/12664728-课计划·七年级英语·RJ·上册(出血文件).pdf_paddleOCR.xlsx'}  # 96
    # data = {'book_id': '12670279', 'book_name': '2025版ZT7地-下册横版《锦上添花(期末大赢家)》下册-新编文件5.8', "file_path": '/home/liuxin/work/search_book/data/12670279-地理-原图/12670279-2025版ZT7地-下册横版《锦上添花(期末大赢家)》下册-新编文件5.8.pdf_paddleOCR.xlsx'}  # 26
    # insert_excel(data)

    # 2. Delete the data for a book_id from ES
    # book_id = "12664728"
    # book_id = "12670279"
    book_id = "12667382"
    # book_id = "12663121"
    # delete_es_book(book_id)

    # 3. Search by OCR text
    book_id = "12664728"
    image_ocr = "student in my class is happy to meet our new teacher."
    res = search_book_question(book_id, image_ocr)
    print(res)
    print("\nfinished.")
```
#### 7. Elasticsearch retrieval demo (text, vector, and RRF search)

A standalone demo of the retrieval options against a test image index (the EsHelper used by the service is a separate module, not shown here):

```
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
import numpy as np
import json

# Connect to Elasticsearch
es = Elasticsearch(
    hosts=["http://192.168.8.224:19200"],  # the newly set-up ES instance
    # basic_auth=("elastic", "password")   # replace with your credentials
)

# Create the index (if it does not exist)
def create_index(index_name):
    if not es.indices.exists(index=index_name):
        index_settings = {
            "mappings": {
                "properties": {
                    "image_url": {"type": "keyword"},
                    "image_description": {"type": "text"},
                    "image_vector": {
                        "type": "dense_vector",
                        "dims": 512,  # vector dimension; adjust to your embedding model
                        "index": True,
                        "similarity": "cosine"
                    }
                }
            }
        }
        es.indices.create(index=index_name, body=index_settings)
        print(f"Index {index_name} created")

# Insert a single image document
def insert_single_image(index_name, image_url, image_description, image_vector):
    doc = {
        "image_url": image_url,
        "image_description": image_description,
        "image_vector": image_vector
    }
    es.index(index=index_name, document=doc)
    print(f"Image {image_url} inserted")

# Bulk-insert image documents
def insert_bulk_images(index_name, image_data_list):
    actions = []
    for data in image_data_list:
        action = {
            "_index": index_name,
            "_source": {
                "image_url": data["image_url"],
                "image_description": data["image_description"],
                "image_vector": data["image_vector"]
            }
        }
        actions.append(action)
    bulk(es, actions)
    print(f"Bulk insert of {len(actions)} documents succeeded")

# Full-text retrieval
def text_search(index_name, query_text, size=10):
    query = {
        "match": {
            "image_description": query_text
        }
    }
    results = es.search(index=index_name, query=query, size=size)
    return results['hits']['hits']

# Vector retrieval
def vector_search(index_name, query_vector, size=10):
    query = {
        "script_score": {
            "query": {"match_all": {}},
            "script": {
                "source": "cosineSimilarity(params.query_vector, 'image_vector') + 1.0",
                "params": {"query_vector": query_vector}
            }
        }
    }
    results = es.search(index=index_name, query=query, size=size)
    return results['hits']['hits']

# Combined text + vector retrieval (weighted bool query)
def hybrid_search(index_name, query_text, query_vector, text_weight=0.5, vector_weight=0.5, size=10):
    query = {
        "bool": {
            "should": [
                {
                    "match": {
                        "image_description": {
                            "query": query_text,
                            "boost": text_weight
                        }
                    }
                },
                {
                    "script_score": {
                        "query": {"match_all": {}},
                        "script": {
                            "source": "cosineSimilarity(params.query_vector, 'image_vector') * params.vector_weight",
                            "params": {
                                "query_vector": query_vector,
                                "vector_weight": vector_weight
                            }
                        }
                    }
                }
            ]
        }
    }
    results = es.search(index=index_name, query=query, size=size)
    return results['hits']['hits']

# RRF fusion
def rrf_fusion(text_results, vector_results, k=60):
    """
    Fuse the two retrieval result lists with Reciprocal Rank Fusion (RRF).
    Args:
        text_results: ES full-text retrieval results
        vector_results: ES vector retrieval results
        k: the rank constant in the RRF formula, usually 60
    Returns:
        The fused result list.
    """
    # Map document IDs to their text-search ranks
    text_rank_map = {hit['_id']: rank + 1 for rank, hit in enumerate(text_results)}
    # Map document IDs to their vector-search ranks
    vector_rank_map = {hit['_id']: rank + 1 for rank, hit in enumerate(vector_results)}
    # Union of all document IDs
    all_doc_ids = set(text_rank_map.keys()).union(set(vector_rank_map.keys()))
    # Compute the RRF scores
    rrf_scores = {}
    for doc_id in all_doc_ids:
        text_rank = text_rank_map.get(doc_id, float('inf'))
        vector_rank = vector_rank_map.get(doc_id, float('inf'))
        # RRF formula: 1 / (k + rank)
        text_score = 1 / (k + text_rank) if text_rank != float('inf') else 0
        vector_score = 1 / (k + vector_rank) if vector_rank != float('inf') else 0
        rrf_scores[doc_id] = text_score + vector_score
    # Sort by RRF score
    sorted_doc_ids = sorted(rrf_scores.items(), key=lambda x: x[1], reverse=True)
    # Build the final results
    final_results = []
    # Prefer the full hit from the text results
    id_to_hit = {hit['_id']: hit for hit in text_results}
    # Add documents that only appear in the vector results
    for hit in vector_results:
        if hit['_id'] not in id_to_hit:
            id_to_hit[hit['_id']] = hit
    for doc_id, score in sorted_doc_ids:
        if doc_id in id_to_hit:
            result = {
                '_id': doc_id,
                '_score': score,
                '_source': id_to_hit[doc_id]['_source']
            }
            final_results.append(result)
    return final_results

# RRF text + vector retrieval
def hybrid_search_rrf(index_name, query_text, query_vector, size=10, rrf_k=60):
    """Fuse text and vector retrieval results with RRF"""
    # Run both retrievals
    text_hits = text_search(index_name, query_text)
    vector_hits = vector_search(index_name, query_vector)
    # Fuse the results with RRF
    fused_results = rrf_fusion(text_hits, vector_hits, k=rrf_k)
    # Return the top `size` results
    return fused_results[:size]

# Example usage
if __name__ == "__main__":
    INDEX_NAME = "tst_images_text"
    # 1. Create the index
    create_index(INDEX_NAME)
    # Sample data
    sample_images = [
        {
            "image_url": "https://oss.5rs.me/oss/upload/image/jpeg/b2f3fbaa98a3f2552f2dba7451decf7b_36_20250605135114454.jpeg",
            "image_description": "、填一填。培优作业18 80 78 2.比 16 少9的数是( )多8的数是 17;27 );比( 与 30的和是( );47比3多( )个十和( )个一;7个十和2个一 3.83 里面有( 合起来是( ),它们 4.最大的两位数是(),最小的三位数是(相差(5.写一写,比一比。百位十位个位百位十位个位百位十位个位选择。(把正确答案的序号填在括号里)1.根据艾蒿叶子的影子判断生长时间最短的叶子是(2.在远古时代,为了记录猎物的多少,人们常用石子计数。假设用0表示10.用一表示1,那么下面表示的数最大的是",
            "image_vector": np.random.random(512).tolist()  # replace with the real image vector
        },
        {
            "image_url": "https://oss.5rs.me/oss/upload/image/jpeg/9579d101ec560c85f204fa11b7a792c0_2_20250605094300322.jpeg",
            "image_description": "、直接写得数。培优作业1*18 x10=50 x90 =18 x41~51 x22~30x40 =12 x70 =32 x49 ~17 x78 ~20 x15 =44 x20 =78 x11~46 x23 ~二、填一填。1.13的30倍是(),10个22的和是()位数,积大约是(2.49 x28 的积是()。3.25x40的积的末尾有()个0。4.小亮平均每分钟走69米,他从家走到学校用了12分钟。小亮家离学校大约有()米。5.晶晶平均每分钟可以写25个字,照这样的速度,她15分钟可以写)个字。三、用竖式计算,并验算。18 x51 =32 x60 =46 x25 =四、在( )里填上“ >”“<”或“=”30 x30( )100070x12()80034 x56)36 x5415 x40( )60011 x44(訕興癪9 x13)450)13 x8921 x3425 x801圧ʌ帳八廈仈 x30)600)200023 x45",
            "image_vector": np.random.random(512).tolist()  # replace with the real image vector
        }
    ]
    # 2. Bulk insert
    # insert_bulk_images(INDEX_NAME, sample_images)
    # 3. Full-text retrieval example
    print("\n3. Full-text retrieval results:")
    text_results = text_search(INDEX_NAME, "填一填")
    for hit in text_results:
        print(f"Score: {hit['_score']}, \nDescription: {hit['_source']['image_description']}")
        print()
    # 4. Vector retrieval example
    print("\n4. Vector retrieval results:")
    vector_results = vector_search(INDEX_NAME, sample_images[0]["image_vector"])
    for hit in vector_results:
        print(f"Score: {hit['_score']}, \nDescription: {hit['_source']['image_description']}")
        print()
    # 5. Hybrid retrieval example
    print("\n5. Hybrid retrieval results:")
    hybrid_results = hybrid_search(
        INDEX_NAME,
        "可爱的动物",
        sample_images[0]["image_vector"],
        text_weight=0.4,
        vector_weight=0.6
    )
    for hit in hybrid_results:
        print(f"Score: {hit['_score']}, \nDescription: {hit['_source']['image_description']}")
        print()
    # 6. RRF hybrid retrieval example
    print("\n6. RRF hybrid retrieval results:")
    hybrid_results = hybrid_search_rrf(
        INDEX_NAME,
        "填一填。培优作业18 80 78 2.比 16 少9的数是( )多8的数是 17;27 );比( 与 30的和是( );47比3多( )个十和( )个一;7个十和2个",
        sample_images[0]["image_vector"],
        size=10,
        rrf_k=60
    )
    for hit in hybrid_results:
        print(f"Score: {hit['_score']:.4f}, \nDescription: {hit['_source']['image_description']}")
        print()
    print("\nfinished.")
```
#### 8. paddle_ocr.py

Triton gRPC client for the paddleOCR model:

```
from tritonclient import grpc as triton_grpc
import numpy as np
import json
from PIL import Image
import io
import base64

INPUT_NAME = "input"
OUTPUT_NAME = "output"

def _create_triton_input(data):
    # Serialize the request dict into a 1x1 BYTES tensor
    data = json.dumps(data, separators=(",", ":"))
    data = data.encode("utf-8")
    data = [[data]]
    data = np.array(data, dtype=np.object_)
    return data

def _parse_triton_output(data):
    # Decode the 1x1 BYTES tensor back into a dict
    data = data[0, 0]
    data = data.decode("utf-8")
    data = json.loads(data)
    return data

def triton_request(client, data, *, request_kwargs=None):
    if request_kwargs is None:
        request_kwargs = {}
    input_ = triton_grpc.InferInput(INPUT_NAME, [1, 1], "BYTES")
    input_.set_data_from_numpy(_create_triton_input(data))
    results = client.infer('ocr', inputs=[input_], **request_kwargs)
    output = results.as_numpy(OUTPUT_NAME)
    return _parse_triton_output(output)

def infer_paddle_ocr(im: Image.Image, client):
    # Encode the image as base64 JPEG and call the Triton OCR model
    buffer = io.BytesIO()
    im.save(buffer, format='JPEG')
    buffer.seek(0)
    base64_file = base64.b64encode(buffer.read()).decode("ascii")
    input_ = {"file": base64_file,
              "fileType": 1}
    # Retry indefinitely until the OCR service reports success (errorCode == 0)
    while True:
        output_ocr = triton_request(client, input_)
        errorCode = output_ocr["errorCode"]
        if errorCode == 0:
            result = output_ocr["result"]
            res_ocr = result["ocrResults"][0]['prunedResult']
            db_boxes = res_ocr['rec_boxes']
            rec_texts = res_ocr['rec_texts']
            break
    return db_boxes, rec_texts

if __name__ == '__main__':
    paddle_ocr_url = '61.170.32.8:38896'
    client_ocr = triton_grpc.InferenceServerClient(paddle_ocr_url)
    # im = '/data/liyaze/doc2xkey_server_v2/utils/my-obs-object-key-demo.jpg'
    im = "/home/liuxin/work/search_book/data/12664728-课计划·七年级英语·RJ·上册/1.jpg"
    im = Image.open(im).convert('RGB')
    db_boxes, rec_texts = infer_paddle_ocr(im, client_ocr)
    print(rec_texts)
```
import re,json,sys,os
import pandas as pd
import numpy as np
class prepareData():
def __init__(self):
self.index = "index" # int
self.fileId = "bankId" # int
self.question_image_url = "question_image_url"
self.question_text = "question_text"
self.answer_image_url = "answer_image_url"
self.answer_text = "answer_text"
self.question_chapter = "question_chapter"
self.question_category = "question_category"
self.page_id = "page_id" # int
self.book_page_id = "book_page_id"
self.knowledge_point = "knowledge_point"
self.knowledge_point_code = "knowledge_point_code"
self.difficult = "difficult"
self.resolve = "resolve"
self.appendix = "appendix"
self.source_image_url = "source_image_url"
self.page_ocr = "page_ocr"
self.title2tile = {
"序号": self.index,
"题目id": self.fileId,
"题目图片地址": self.question_image_url,
"题目文本": self.question_text,
"答案图片地址": self.answer_image_url,
"答案文本": self.answer_text,
"所属章节": self.question_chapter,
"题目类型": self.question_category,
"电子页码": self.page_id,
"纸书页码": self.book_page_id,
"知识点": self.knowledge_point,
"知识点代码": self.knowledge_point_code,
"难度": self.difficult,
"解析": self.resolve,
"附件": self.appendix,
"source_image_url": self.source_image_url,
"page_ocr":self.page_ocr
}
def clean(self, text):
text = re.sub(r'<img[^>]*>', '', text)
text = re.sub(r"</p><p>", " ", text)
text = re.sub(r"<p>", " ", text)
text = re.sub(r"</p>", " ", text)
text = text.strip()
return text
def read_excel(self,excel_path, sheet_name="Sheet1"):
data = pd.read_excel(excel_path, sheet_name)
data = data.rename(columns=self.title2tile)
data.fillna("")
data = data.to_dict(orient='records')
for item in data:
item[self.index] = int(item[self.index])
item[self.fileId] = int(item[self.fileId])
item[self.question_image_url] = str(item[self.question_image_url])
item[self.question_text] = str(item[self.question_text])
item[self.answer_image_url] = str(item[self.answer_image_url])
item[self.answer_text] = str(item[self.answer_text])
item[self.question_chapter] = str(item[self.question_chapter])
item[self.question_category] = str(item[self.question_category])
item[self.page_id] = int(item[self.page_id])
item[self.book_page_id] = str(item[self.book_page_id])
item[self.knowledge_point] = str(item[self.knowledge_point])
item[self.knowledge_point_code] = str(item[self.knowledge_point_code])
item[self.difficult] = str(item[self.difficult])
item[self.resolve] = str(item[self.resolve])
item[self.appendix] = str(item[self.appendix])
item.pop('question_category')
item.pop('appendix')
return data
def prepare_data(self, data_list, book_id, book_name):
save_data_temp = {}
for line in data_list:
# line[self.question_text] = self.clean(line[self.question_text])
page_id = line[self.page_id]
if page_id in save_data_temp:
save_data_temp[page_id].append(line)
else:
save_data_temp[page_id] = [line]
save_data = []
flag = True
count = 1
for page_id, questions in save_data_temp.items():
page_id = questions[0]['page_id']
page_ocr = questions[0]['page_ocr']
page_ocr = json.loads(page_ocr)['rec_texts']
page_ocr = " ".join(page_ocr)
image_ocr_clean = []
for question in questions:
question_text = question[self.question_text]
question_text = self.clean(question_text)
image_ocr_clean.append(question_text)
question.pop("page_ocr")
# if not image_ocr_clean:
# flag = False
# return flag, json.dumps(questions, ensure_ascii=False) # 确保excel题目文本列 清洗后 文本不为空
image_ocr_clean = "\n".join(image_ocr_clean)
temp = {"book_id_page_id":str(book_id)+"-"+str(page_id), "book_id": book_id, "book_name": book_name, "image_ocr_clean": image_ocr_clean, "page_ocr":page_ocr, "questions": questions}
count += 1
save_data.append(temp)
return flag, save_data
def write2json(self, path, data_list):
with open(path, "w", encoding='utf-8') as f:
data_list = json.dumps(data_list, ensure_ascii=False, indent=4)
f.write(data_list)
print(f"success save: {path} ")
def result1(self):
excel_path = "/home/liuxin/work/search_book/data/12663121-衔接教材 地理.pdf.xlsx"
save_path = excel_path[:-4] + "json"
sheet_name = "题库数据"
book_id = "12663121"
book_name = "衔接教材 地理"
data_list = self.read_excel( excel_path, sheet_name=sheet_name )
flag, save_data = self.prepare_data( data_list, book_id, book_name)
if not flag:
print(book_name)
print("excel题目文本列 清洗后 文本不为空:")
print("book_id: ",book_id)
# print(save_data)
self.write2json( save_path, save_data)
def result2(self):
# excel_path = "/home/liuxin/work/search_book/data/12667382-25春北师一数暑假作业.pdf.xlsx"
excel_path = "/home/liuxin/work/search_book/data/12667382一年级数学-原图/12667382-25春北师一数暑假作业.pdf_paddleOCR.xlsx"
save_path = excel_path[:-4] + "json"
sheet_name = "题库数据"
book_id = "12667382"
book_name = "25春北师一数暑假作业"
data_list = self.read_excel( excel_path, sheet_name=sheet_name )
flag, save_data = self.prepare_data( data_list, book_id, book_name)
# if not flag:
# print(book_name)
# print("excel题目文本列 清洗后 文本不为空:")
# print("book_id: ",book_id)
# # print(save_data)
self.write2json( save_path, save_data)
def result3(self):
# excel_path = "/home/liuxin/work/search_book/data/12664728-课计划·七年级英语·RJ·上册(出血文件).pdf.xlsx"
excel_path = "/home/liuxin/work/search_book/data/12664728-课计划·七年级英语·RJ·上册-原图/12664728-课计划·七年级英语·RJ·上册(出血文件).pdf_paddleOCR.xlsx"
save_path = excel_path[:-4] + "json"
sheet_name = "题库数据"
book_id = "12664728"
book_name = "课计划·七年级英语·RJ·上册(出血文件)"
data_list = self.read_excel( excel_path, sheet_name=sheet_name )
flag, save_data = self.prepare_data( data_list, book_id, book_name)
if not flag:
print(book_name)
print("excel题目文本列 清洗后 文本不为空:")
print("book_id: ",book_id)
# print(save_data)
self.write2json( save_path, save_data)
def result4(self):
# excel_path = "/home/liuxin/work/search_book/data/12670279-2025版ZT7地-下册横版《锦上添花(期末大赢家)》下册-新编文件5.8.pdf.xlsx"
excel_path = "/home/liuxin/work/search_book/data/12670279-地理-原图/12670279-2025版ZT7地-下册横版《锦上添花(期末大赢家)》下册-新编文件5.8.pdf_paddleOCR.xlsx"
save_path = excel_path[:-4] + "json"
sheet_name = "题库数据"
book_id = "12670279"
book_name = "2025版ZT7地-下册横版《锦上添花(期末大赢家)》下册-新编文件"
data_list = self.read_excel( excel_path, sheet_name=sheet_name )
flag, save_data = self.prepare_data( data_list, book_id, book_name)
if not flag:
print(book_name)
print("excel题目文本列 清洗后 文本不为空: ")
print("book_id: ",book_id)
# print(save_data)
self.write2json( save_path, save_data)
if __name__ == "__main__":
prepare_data = prepareData()
# prepare_data.result1()
prepare_data.result2()
prepare_data.result3()
prepare_data.result4()
print("\nfinished.")
#### 10. Batch OCR and search evaluation

Script that attaches paddleOCR results to the split-book Excel files and batch-tests search quality (file name not shown in this excerpt):

```
import json, re, os
import pandas as pd
from tritonclient import grpc as triton_grpc
from paddle_ocr import infer_paddle_ocr
from PIL import Image
import requests
from api_tst import search_book_question, get_page_image_url

class AddPaddleOCR():
    # Add the paddleOCR result to every page image
    def __init__(self):
        paddle_ocr_url = '61.170.32.8:38896'
        self.client_ocr = triton_grpc.InferenceServerClient(paddle_ocr_url)

    def download_image(self, image_url, save_path):
        # Fetch the image over HTTP
        response = requests.get(image_url, stream=True, timeout=10)
        # Raise if the request failed
        response.raise_for_status()
        # Make sure the target directory exists
        directory = os.path.dirname(save_path)
        if not os.path.exists(directory):
            os.makedirs(directory)
        # Write the file in chunks
        with open(save_path, 'wb') as file:
            for chunk in response.iter_content(chunk_size=8192):
                if chunk:
                    file.write(chunk)

    def paddle_ocr(self, file):
        # e.g. file = "/home/liuxin/work/search_book/data/12664728-课计划·七年级英语·RJ·上册/1.jpg"
        im = Image.open(file).convert('RGB')
        db_boxes, rec_texts = infer_paddle_ocr(im, self.client_ocr)
        paddle_ocr = {"db_boxes": db_boxes, "rec_texts": rec_texts}
        paddle_ocr = json.dumps(paddle_ocr, ensure_ascii=False)
        return paddle_ocr

    def read_excel(self, excel_path, save_image_path):
        sheet_name = "image_url"
        data = pd.read_excel(excel_path, sheet_name)
        # data = data.rename(columns=self.title2tile)
        data = data.fillna("")
        data = data.to_dict(orient='records')
        page_id2ocr = {}
        for line in data:
            page_id = str(line['电子页码'])
            source_image_url = line['source_image_url']
            print("paddle_ocr file:", source_image_url)
            save_path = os.path.join(save_image_path, page_id + ".jpeg")
            self.download_image(source_image_url, save_path)
            paddle_ocr = self.paddle_ocr(save_path)
            page_id2ocr[page_id] = {"source_image_url": source_image_url, "page_ocr": paddle_ocr}
        sheet_name = "题库数据"
        data2 = pd.read_excel(excel_path, sheet_name)
        data2 = data2.fillna("")
        data2 = data2.to_dict(orient='records')
        for line in data2:
            page_id = str(line['电子页码'])
            source_image_url = page_id2ocr[page_id]['source_image_url']
            page_ocr = page_id2ocr[page_id]['page_ocr']
            line['source_image_url'] = source_image_url
            line['page_ocr'] = page_ocr
        save_data = pd.DataFrame(data2)
        path_file = excel_path[:-5] + "_paddleOCR.xlsx"
        save_data.to_excel(path_file)
        print("save_path: ", path_file)
        print("\n\n")

AddPaddleocr = AddPaddleOCR()
file_list = ["/home/liuxin/work/search_book/data/12667382-25春北师一数暑假作业.pdf.xlsx",
             "/home/liuxin/work/search_book/data/12664728-课计划·七年级英语·RJ·上册(出血文件).pdf.xlsx",
             "/home/liuxin/work/search_book/data/12670279-2025版ZT7地-下册横版《锦上添花(期末大赢家)》下册-新编文件5.8.pdf.xlsx"
             ]
source_image_path = ["/home/liuxin/work/search_book/data/12667382一年级数学-原图",
                     "/home/liuxin/work/search_book/data/12664728-课计划·七年级英语·RJ·上册-原图",
                     "/home/liuxin/work/search_book/data/12670279-地理-原图",
                     ]
# Get the paddleOCR result for every split-book page image
# for excel_path, image_path in zip(file_list, source_image_path):
#     AddPaddleocr.read_excel(excel_path, image_path)

# Evaluate search quality
class Test():
    def __init__(self):
        paddle_ocr_url = '61.170.32.8:38896'
        self.client_ocr = triton_grpc.InferenceServerClient(paddle_ocr_url)

    def paddle_ocr(self, file):
        # e.g. file = "/home/liuxin/work/search_book/data/12664728-课计划·七年级英语·RJ·上册/1.jpg"
        im = Image.open(file).convert('RGB')
        db_boxes, rec_texts = infer_paddle_ocr(im, self.client_ocr)
        paddle_ocr = {"db_boxes": db_boxes, "rec_texts": rec_texts}
        paddle_ocr = json.dumps(paddle_ocr, ensure_ascii=False)
        paddle_ocr_text = " ".join(rec_texts)
        return paddle_ocr, paddle_ocr_text

    def es_search(self, book_id, image_ocr):
        # url = 'http://localhost:31000/search_book_quesiton'
        url = 'http://61.170.32.8:31000/search_book_quesiton'
        headers = {
            'accept': 'application/json',
        }
        # JSON body
        data = {
            "book_id": book_id,
            "image_ocr": image_ocr,
            "top_k": 1
        }
        response = requests.post(url, headers=headers, json=data)
        # print(f"Status code: {response.status_code}")
        res = response.json()
        bankId = res['es_search'][0]['_source']['questions'][0]['bankId']
        # res = json.dumps(res, ensure_ascii=False, indent=4)
        return bankId

    def get_page_image_url(self, bankId):
        # Get the URL of the full-page image for a question bankId
        url = "https://rays7.5rs.me/matrix/v1.0/aIRecognized/getPageInfoByBankId"
        headers = {
            "token": "whlg2025!",
            "Only-For-Backend": "y"
        }
        data = {"bankId": bankId}
        response = requests.get(url, headers=headers, params=data)
        sourceImageUrl = response.json()['data']['sourceImageUrl']
        return sourceImageUrl

    def banch_image(self, path, book_id):
        # Take a directory containing only page images, OCR each one,
        # search ES with the OCR text, and save the results to Excel
        file_all = []
        paddle_ocr_all = []
        paddle_ocr_text_all = []
        bankId_all = []
        sourceImageUrl_all = []
        file_list = os.listdir(path)
        file_list = [item for item in file_list if item.endswith('jpg')]
        # Sort numerically by the page number in the file name
        file_list = [int(item[:-4]) for item in file_list]
        file_list = sorted(file_list)
        file_list = [str(item) + ".jpg" for item in file_list]
        print(f"{len(file_list)} images in total.")
        # file_list = file_list[:3]
        for file0 in file_list:
            file_all.append(file0)
            file = os.path.join(path, file0)
            paddle_ocr, paddle_ocr_text = self.paddle_ocr(file)
            bankId = self.es_search(book_id, paddle_ocr_text)
            sourceImageUrl = self.get_page_image_url(bankId)
            paddle_ocr_all.append(paddle_ocr)
            paddle_ocr_text_all.append(paddle_ocr_text)
            bankId_all.append(bankId)
            sourceImageUrl_all.append(sourceImageUrl)
            print(file0, ": search result:", sourceImageUrl)
        data = {"image_name": file_all, "paddle_ocr": paddle_ocr_all, "paddle_ocr_text": paddle_ocr_text_all, "bankId": bankId_all, "es_search_page_url": sourceImageUrl_all}
        data = pd.DataFrame(data)
        excel_path = os.path.join(path, f'{book_id}_测试.xlsx')
        data.to_excel(excel_path)

    def get_source_image_url(self, excel_file):
        # Collect the full-page image_url for each page in the labeled data
        print(excel_file)
        data = pd.read_excel(excel_file)
        question_ids = data['题目id']
        page_ids = data['电子页码']
        page_id_res = []
        question_id_res = []
        source_image_url_res = []
        for page_id, question_id in zip(page_ids, question_ids):
            if page_id in page_id_res:
                continue
            page_id_res.append(page_id)
            question_id_res.append(question_id)
            image_url = self.get_page_image_url(question_id)
            print(image_url)
            source_image_url_res.append(image_url)
        data = {"电子页码": page_id_res, "题目id": question_id_res, "source_image_url": source_image_url_res}
        data = pd.DataFrame(data)
        save_file = excel_file[:-5] + "SourceImageUrl.xlsx"
        data.to_excel(save_file)
        print(save_file)

    def result1(self):
        # path = r"/home/liuxin/work/search_book/data/12664728-课计划·七年级英语·RJ·上册"  # 110 page images
        # book_id = "12664728"
        # path = r"/home/liuxin/work/search_book/data/12667382一年级数学"  # 44 page images
        # book_id = "12667382"
        path = "/home/liuxin/work/search_book/data/12670279-地理"  # 26 page images
        book_id = "12670279"
        self.banch_image(path, book_id)

if __name__ == "__main__":
    test = Test()
    test.result1()  # evaluate search quality
    # excel_file = ["/home/liuxin/work/search_book/data/12664728-课计划·七年级英语·RJ·上册(出血文件).pdf.xlsx",
    #               "/home/liuxin/work/search_book/data/12667382-25春北师一数暑假作业.pdf.xlsx",
    #               "/home/liuxin/work/search_book/data/12663121-衔接教材 地理.pdf.xlsx",
    #               "/home/liuxin/work/search_book/data/12670279-2025版ZT7地-下册横版《锦上添花(期末大赢家)》下册-新编文件5.8.pdf.xlsx"
    #               ]
    # for file in excel_file:
    #     test.get_source_image_url(file)
    print("\nfinished.")
```