Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
search_question
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
刘鑫
search_question
Commits
8058a38d
Commit
8058a38d
authored
Jul 31, 2025
by
unknown
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
add image url post
parent
052f0308
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
67 additions
and
36 deletions
+67
-36
README.md
README.md
+39
-8
api_service.py
api_service.py
+28
-28
api_tst.py
api_tst.py
+0
-0
No files found.
README.md
View file @
8058a38d
...
@@ -22,7 +22,7 @@ conda虚拟环境:conda activate search_book
...
@@ -22,7 +22,7 @@ conda虚拟环境:conda activate search_book
准备工作:
准备工作:
1、将习题图书拆分为整页整页的完整图片,并使用合合OCR解析整页图片,放入excel的page_ocr列。
1、将习题图书拆分为整页整页的完整图片,并使用合合OCR解析整页图片,放入excel的page_ocr列。
2、将习题图书拆分为一个个独立的题目图片,并使用合合OCR解析题目图片,将ocr结果放入question_ocr列。将题目图片和题目合合OCR结果一起输入豆包embedding模型"doubao-embedding-vision-250615"得到一个1024维的向量,处理为json字符串后放入excel的“question_image_embedding”列里。
2、将习题图书拆分为一个个独立的题目图片,并使用合合OCR解析题目图片,将ocr结果放入question_ocr列。将题目图片和题目合合OCR结果一起输入豆包embedding模型"doubao-embedding-vision-250615"得到一个1024维的向量,处理为json字符串后放入excel的“question_image_embedding”列里。
3、后端将以上excel数据通过接口插入es数据库(excel文档样例:
“12668836-2025.xlsx”),在原有拆书平台下载的excel文档中添加【question_ocr(题目的ocr结果),source_image_url(整页图片url),page_ocr(整页图片ocr),question_image_embedding(题目ocr和题目原图的向量)】列
3、后端将以上excel数据通过接口插入es数据库(excel文档样例:
12668836-2025.xlsx)
搜整页图片流程:
搜整页图片流程:
...
@@ -108,7 +108,6 @@ delete_es_book(book_id)
...
@@ -108,7 +108,6 @@ delete_es_book(book_id)
# 3、搜整页图片接口
# 3、搜整页图片接口
def search_book_question(book_id, image_ocr):
def search_book_question(book_id, image_ocr):
# 3、搜书搜题
url = 'http://localhost:31001/search_page'
url = 'http://localhost:31001/search_page'
url = 'http://61.170.32.8:31001/search_page'
url = 'http://61.170.32.8:31001/search_page'
headers = {
headers = {
...
@@ -142,8 +141,6 @@ print(res)
...
@@ -142,8 +141,6 @@ print(res)
#### 4、搜题接口 (只用题目合合OCR结果)
#### 4、搜题接口 (只用题目合合OCR结果)
```
```
# 4、搜题(只用题目paddleOCR结果)接口
def search_question_text(book_id, question_ocr):
def search_question_text(book_id, question_ocr):
url = 'http://localhost:31001/search_question_text'
url = 'http://localhost:31001/search_question_text'
# url = 'http://61.170.32.8:31001/search_question_text'
# url = 'http://61.170.32.8:31001/search_question_text'
...
@@ -178,9 +175,7 @@ search_question_text(book_id, question_ocr)
...
@@ -178,9 +175,7 @@ search_question_text(book_id, question_ocr)
#### 5、搜题接口 (题目合合ocr和题目图片作为一个整体的向量)
#### 5、搜题接口 (题目合合ocr和题目图片作为一个整体的向量)
```
```
# 5、搜题接口 (题目合合ocr和题目图片作为一个整体的向量)
def search_question_embedding(book_id:str, question_embedding:str):
def search_question_embedding(book_id:str, question_embedding):
url = 'http://localhost:31001/search_question_embedding'
url = 'http://localhost:31001/search_question_embedding'
# url = 'http://61.170.32.8:31001/search_question_embedding'
# url = 'http://61.170.32.8:31001/search_question_embedding'
headers = {
headers = {
...
@@ -190,7 +185,7 @@ def search_question_embedding(book_id:str, question_embedding):
...
@@ -190,7 +185,7 @@ def search_question_embedding(book_id:str, question_embedding):
# 表单数据
# 表单数据
data = {
data = {
"book_id": book_id,
"book_id": book_id,
"question_embedding": question_embedding,
"question_embedding": question_embedding,
# 1024维
"top_k": 1
"top_k": 1
}
}
data = json.dumps(data, ensure_ascii=False)
data = json.dumps(data, ensure_ascii=False)
...
@@ -214,6 +209,42 @@ res=search_question_embedding(book_id, question_embedding)
...
@@ -214,6 +209,42 @@ res=search_question_embedding(book_id, question_embedding)
#### 6、搜题接口 (输入题目图片合合ocr和题目图片url链接)
```
def search_question_embedding2(book_id, question_image_url, question_image_ocr):
    """Search for a question using its image URL and OCR text.

    Posts ``book_id``, ``question_image_url`` and ``question_image_ocr`` to the
    ``/search_question_embedding`` endpoint with ``top_k`` fixed to 1, prints
    the HTTP status code and, when at least one hit is returned, the
    ``bankId`` of the first hit.

    Args:
        book_id: Book identifier sent as ``"book_id"``.
        question_image_url: URL of the question image.
        question_image_ocr: OCR text of the question image.

    Returns:
        The JSON response body re-serialized as an indented string. Error
        responses (no ``"es_search"`` key) are returned the same way instead
        of raising ``KeyError``.
    """
    # Switch to the local endpoint by swapping which line is commented out.
    # url = 'http://localhost:31001/search_question_embedding'
    url = 'http://61.170.32.8:31001/search_question_embedding'
    headers = {
        'accept': 'application/json',
    }
    # Request payload
    data = {
        "book_id": book_id,
        "question_image_url": question_image_url,
        "question_image_ocr": question_image_ocr,
        "top_k": 1
    }
    data = json.dumps(data, ensure_ascii=False)
    # Encode explicitly: a str body is sent as ISO-8859-1 by http.client,
    # which fails on the non-ASCII OCR text kept by ensure_ascii=False.
    response = requests.post(url, headers=headers, data=data.encode('utf-8'))
    # Print the response status
    print(f"状态码: {response.status_code}")
    res = response.json()
    # Error responses carry no "es_search" list, and a search may return no
    # hits; only read bankId when a first hit exists.
    hits = res.get('es_search') if isinstance(res, dict) else None
    if hits:
        bankId = hits[0]['_source']['bankId']
        print("bankId: ", bankId)
    res = json.dumps(res, ensure_ascii=False, indent=4)
    return res
# Example call: search by question image URL + OCR text.
book_id = "12670279"
question_image_ocr = "题目图片的合合ocr结果"
question_image_url = "https://example.123.jpg"
# Call the three-argument URL/OCR variant defined above, not the
# two-argument embedding variant.
res = search_question_embedding2(book_id, question_image_url, question_image_ocr)
```
#### 6、启动服务
#### 7、启动服务
```
linux
```
linux
...
...
api_service.py
View file @
8058a38d
...
@@ -5,7 +5,7 @@ from pathlib import Path
...
@@ -5,7 +5,7 @@ from pathlib import Path
from
pydantic
import
BaseModel
from
pydantic
import
BaseModel
import
uvicorn
import
uvicorn
import
logging
import
logging
from
tools
import
doubao_image_embedding
from
save_es_database
import
EsHelper
from
save_es_database
import
EsHelper
from
prepare_data
import
prepareData
from
prepare_data
import
prepareData
...
@@ -51,7 +51,9 @@ class SearchQuestionText(BaseModel):
...
@@ -51,7 +51,9 @@ class SearchQuestionText(BaseModel):
# 搜题目请求数据结构(仅输入题目图片的向量)
# 搜题目请求数据结构(仅输入题目图片的向量)
class
SearchQuestionEmbedding
(
BaseModel
):
class
SearchQuestionEmbedding
(
BaseModel
):
book_id
:
str
book_id
:
str
question_embedding
:
str
# 题目的图片的embedding ; json.dumps([])
question_image_url
:
str
=
None
# 题目图片的url链接
question_image_ocr
:
str
=
None
# 题目图片的合合ocr结果
question_embedding
:
str
=
None
# 题目图片和合合ocr文本的豆包1024维向量
top_k
:
int
=
1
top_k
:
int
=
1
# 搜题目请求数据结构(输入题目图片的向量 和题目图片的OCR结果)
# 搜题目请求数据结构(输入题目图片的向量 和题目图片的OCR结果)
...
@@ -225,37 +227,35 @@ async def search_question_text(input: SearchQuestionText):
...
@@ -225,37 +227,35 @@ async def search_question_text(input: SearchQuestionText):
# 4、搜图中的某个题目(只使用题目图片和ocr的向量)
# 4、搜图中的某个题目(只使用题目图片和ocr的向量)
@app.post
(
"/search_question_embedding"
)
@app.post
(
"/search_question_embedding"
)
async
def
search_question_embedding
(
input
:
SearchQuestionEmbedding
):
async
def
search_question_embedding
(
input
:
SearchQuestionEmbedding
):
logger_es
.
info
(
f
"search_question_embedding :
book_id: {input.book_id
}"
)
logger_es
.
info
(
f
"search_question_embedding :
{input
}"
)
book_id
=
input
.
book_id
book_id
=
input
.
book_id
question_embedding
=
input
.
question_embedding
question_image_url
=
input
.
question_image_url
question_image_ocr
=
input
.
question_image_ocr
top_k
=
input
.
top_k
top_k
=
input
.
top_k
try
:
question_embedding
=
json
.
loads
(
question_embedding
)
res
=
await
es
.
search_question_embedding
(
index_name
,
book_id
,
question_embedding
,
top_k
)
except
Exception
as
e
:
logger_es
.
info
(
f
"search_question_embedding : 搜索 book_id:{book_id},出错:{e}。"
)
raise
HTTPException
(
status_code
=
500
,
detail
=
f
"search_question_embedding 接口 book_id:{book_id},出错:{e}。"
)
return
{
"status"
:
"success"
,
"es_search"
:
res
}
# 5、搜图中某个题目(联合文本搜索和向量搜索)
@app.post
(
"/search_question_text_and_embedding"
)
async
def
search_question_text_and_embedding
(
input
:
SearchQuestionTextAndEmbedding
):
logger_es
.
info
(
f
"search_question_text_and_embedding : {input.book_id}"
)
book_id
=
input
.
book_id
question_ocr
=
input
.
question_ocr
question_embedding
=
input
.
question_embedding
question_embedding
=
input
.
question_embedding
top_k
=
input
.
top_k
if
not
question_embedding
and
question_image_url
and
question_image_ocr
:
try
:
question_embedding
=
doubao_image_embedding
(
question_image_url
,
question_image_ocr
)
# 豆包大模型做图片和文本的向量1024维向量
except
Exception
as
e
:
logger_es
.
info
(
f
"search_question_embedding : {input},豆包embedding error :{e}。"
)
raise
HTTPException
(
status_code
=
510
,
detail
=
f
"search_question_embedding: 豆包embedding error: {e}。"
)
try
:
try
:
question_embedding
=
json
.
loads
(
question_embedding
)
res
=
await
es
.
search_question_embedding
(
index_name
,
book_id
,
question_embedding
,
top_k
)
res
=
await
es
.
search_question_textAndEmbedding
(
index_name
,
book_id
,
question_ocr
,
question_embedding
,
top_k
)
except
Exception
as
e
:
except
Exception
as
e
:
logger_es
.
info
(
f
"search_question_embedding : 搜索 book_id:{book_id},出错:{e}。"
)
logger_es
.
info
(
f
"search_question_text_and_embedding : 搜索 book_id:{book_id},出错:{e}。"
)
raise
HTTPException
(
status_code
=
500
,
detail
=
f
"search_question_embedding 接口 book_id:{book_id},出错:{e}。"
)
raise
HTTPException
(
status_code
=
500
,
detail
=
f
"search_question_text_and_embedding 接口 book_id:{book_id},出错:{e}。"
)
elif
question_embedding
:
try
:
question_embedding
=
json
.
loads
(
question_embedding
)
res
=
await
es
.
search_question_embedding
(
index_name
,
book_id
,
question_embedding
,
top_k
)
except
Exception
as
e
:
logger_es
.
info
(
f
"search_question_embedding : 搜索 book_id:{book_id},出错:{e}。"
)
raise
HTTPException
(
status_code
=
500
,
detail
=
f
"search_question_embedding 接口 book_id:{book_id},出错:{e}。"
)
else
:
raise
HTTPException
(
status_code
=
520
,
detail
=
f
"the question_image_url must be efficient"
)
return
{
"status"
:
"success"
,
"es_search"
:
res
}
return
{
"status"
:
"success"
,
"es_search"
:
res
}
...
...
api_tst.py
View file @
8058a38d
This diff is collapsed.
Click to expand it.
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment