Commit 052f0308 by unknown

init

parent 4371d0fa
## Full-Page Image Search and Question Search
#### 1. Requirements
Requirement 1: given a photo of a complete book page taken by the user, find the matching full-page image record in the ES index.
Requirement 2: given a photo of a single question taken by the user, find the matching question image record in the ES index.
Server address: 61.170.32.8
Project path: /home/liuxin/work/search_question
Conda environment: conda activate search_book
#### 2. Search Approach
```
Preparation:
1. Split each exercise book into complete page images, parse every page image with Hehe (合合) OCR, and store the result in the excel column page_ocr.
2. Split each exercise book into individual question images and parse each with Hehe OCR, storing the result in the column question_ocr. Feed the question image together with its Hehe OCR text into the Doubao embedding model "doubao-embedding-vision-250615" to obtain a 1024-dimensional vector, serialize it as a JSON string, and store it in the excel column question_image_embedding.
3. The backend inserts the excel data into the ES database through an API (sample document: "12668836-2025.xlsx"). Four columns are added to the excel downloaded from the original book-splitting platform: question_ocr (OCR of the question), source_image_url (URL of the full-page image), page_ocr (OCR of the full page), and question_image_embedding (joint vector of the question OCR and the question image).

Full-page search flow:
1. The user photographs a complete book page with a phone and uploads it.
2. The uploaded photo is parsed with Hehe OCR.
3. The page-search API receives the long OCR string of the photo and uses it to query ES.
4. The top-1 ES hit is returned.

Question search flow:
1. The user photographs a single question with a phone and uploads it.
2. The question photo is parsed with Hehe OCR.
3. The question OCR text and the question image are fed together into the Doubao embedding model "doubao-embedding-vision-250615" to obtain a 1024-dimensional vector (see the hedged embedding sketch after this block).
4. The search API accepts either just the Hehe OCR text of the question image, or the 1024-dimensional vector.
5. OCR-only question search reaches 99% accuracy; embedding-based search reaches 100%.
```
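
For step 3 of the question flow, the sketch below shows one way to obtain the 1024-dimensional vector. Only the model name comes from this document; the endpoint URL, authentication scheme, payload shape, and response shape are assumptions based on the Volcengine Ark multimodal embeddings API and should be checked against the current Ark documentation.
```python
# Hedged sketch: embed (question image + Hehe OCR text) with Doubao.
import base64
import os

import requests

ARK_URL = "https://ark.cn-beijing.volces.com/api/v3/embeddings/multimodal"  # assumed endpoint

def embed_question(image_path: str, question_ocr: str) -> list:
    with open(image_path, "rb") as f:
        b64 = base64.b64encode(f.read()).decode("ascii")
    payload = {
        "model": "doubao-embedding-vision-250615",
        "input": [
            {"type": "text", "text": question_ocr},
            {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{b64}"}},
        ],
    }
    headers = {"Authorization": f"Bearer {os.environ['ARK_API_KEY']}"}  # assumed auth scheme
    resp = requests.post(ARK_URL, headers=headers, json=payload)
    resp.raise_for_status()
    return resp.json()["data"]["embedding"]  # assumed response shape; expect 1024 floats
```
The returned list can be serialized with json.dumps and written to the question_image_embedding column, which is the format the insert interface below expects.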
#### 3. Interfaces
```python
# 1. Insert excel data into the ES database
import requests

def insert_excel(data):
    # url = 'http://localhost:31001/upload/excel'
    url = 'http://61.170.32.8:31001/upload/excel'
    headers = {'accept': 'application/json'}
    file_path = data['file_path']
    # Upload the file
    with open(file_path, 'rb') as file:
        files = {
            'file': (file_path, file),
        }
        # Send the request while the file handle is still open
        response = requests.post(url, headers=headers, data=data, files=files)
    # Print the response
    print(f"Status code: {response.status_code}")
    print(f"Response body: {response.json()}")

data = {'book_id': '12670279', 'book_name': '衔接教材地理', "file_path": '12670279-2025版《锦上添花(期末大赢家)》下册.xlsx'}
insert_excel(data)
```
```python
# 2. Delete all documents in ES that share the given book_id
import json

import requests

def delete_es_book(book_id="12663121"):
    # url = 'http://localhost:31001/delete_book'
    url = 'http://61.170.32.8:31001/delete_book'
    headers = {
        'accept': 'application/json',
    }
    # Request body (JSON)
    data = {
        "book_id": book_id
    }
    data = json.dumps(data, ensure_ascii=False)
    response = requests.post(url, headers=headers, data=data)
    # Print the response
    print(f"Status code: {response.status_code}")
    res = response.json()
    res = json.dumps(res, ensure_ascii=False, indent=4)
    print(res)

book_id = "12667382"
delete_es_book(book_id)
```
```python
# 3. Full-page image search
import json

import requests

def search_book_question(book_id, image_ocr):
    # url = 'http://localhost:31001/search_page'
    url = 'http://61.170.32.8:31001/search_page'
    headers = {
        'accept': 'application/json',
    }
    # Request body (JSON)
    data = {
        "book_id": book_id,
        "image_ocr": image_ocr,
        "top_k": 1
    }
    data = json.dumps(data, ensure_ascii=False)
    response = requests.post(url, headers=headers, data=data)
    # Print the response
    print(f"Status code: {response.status_code}")
    res = response.json()
    bankId = res['es_search'][0]['_source']['bankId']
    print("bankId: ", bankId)
    res = json.dumps(res, ensure_ascii=False, indent=4)
    return res

book_id = "12664728"
image_ocr = "Hehe OCR text of the full page photographed by the user"
res = search_book_question(book_id, image_ocr)
print(res)
```
#### 4. Question Search Interface (Hehe OCR text only)
```python
# 4. Question search (Hehe OCR text only)
import json

import requests

def search_question_text(book_id, question_ocr):
    url = 'http://localhost:31001/search_question_text'
    # url = 'http://61.170.32.8:31001/search_question_text'
    headers = {
        'accept': 'application/json',
    }
    # Request body (JSON)
    data = {
        "book_id": book_id,
        "question_ocr": question_ocr,
        "top_k": 1
    }
    data = json.dumps(data, ensure_ascii=False)
    response = requests.post(url, headers=headers, data=data)
    # Print the response
    print(f"Status code: {response.status_code}")
    res = response.json()
    bankId = res['es_search'][0]['_source']['bankId']
    print("bankId: ", bankId)
    res = json.dumps(res, ensure_ascii=False, indent=4)
    return res

book_id = "12667382"
question_ocr = "Hehe OCR text of the question photo"
search_question_text(book_id, question_ocr)
```
#### 5. Question Search Interface (joint vector of question OCR and question image)
```python
# 5. Question search (joint vector of question OCR and question image)
import json

import numpy as np
import requests

def search_question_embedding(book_id: str, question_embedding):
    url = 'http://localhost:31001/search_question_embedding'
    # url = 'http://61.170.32.8:31001/search_question_embedding'
    headers = {
        'accept': 'application/json',
    }
    # Request body (JSON)
    data = {
        "book_id": book_id,
        "question_embedding": question_embedding,
        "top_k": 1
    }
    data = json.dumps(data, ensure_ascii=False)
    response = requests.post(url, headers=headers, data=data)
    # Print the response
    print(f"Status code: {response.status_code}")
    res = response.json()
    bankId = res['es_search'][0]['_source']['bankId']
    print("bankId: ", bankId)
    res = json.dumps(res, ensure_ascii=False, indent=4)
    return res

book_id = "12670279"
# In production this is the 1024-d vector from the Doubao model
# "doubao-embedding-vision-250615"; a random vector stands in here.
question_embedding = np.random.random(1024).tolist()
question_embedding = json.dumps(question_embedding)
res = search_question_embedding(book_id, question_embedding)
```
#### 6. Start the Service
```bash
# On server 61.170.32.8
$ cd /home/liuxin/work/search_question
$ conda activate search_book
$ nohup python -u api_service.py > api_service.log 2>&1 &
$ tail -f log/search_question.log
```
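
api_service.py itself is not shown in this section, so purely as an illustration of the request/response contract documented above, here is a minimal sketch of what the /search_page handler might look like, assuming FastAPI and a plain match query on page_ocr. The index name, query shape, and handler structure are assumptions; only the field names (book_id, image_ocr, top_k, es_search) come from this document.
```python
# Hypothetical /search_page handler; verify against the real api_service.py.
from elasticsearch import Elasticsearch
from fastapi import FastAPI
from pydantic import BaseModel

app = FastAPI()
es = Elasticsearch(hosts=["http://192.168.8.224:19200"])

class PageQuery(BaseModel):
    book_id: str
    image_ocr: str
    top_k: int = 1

@app.post("/search_page")
def search_page(q: PageQuery):
    # Full-text match on the stored page OCR, restricted to a single book.
    query = {
        "bool": {
            "must": [{"match": {"page_ocr": q.image_ocr}}],
            "filter": [{"term": {"book_id": q.book_id}}],
        }
    }
    results = es.search(index="book_pages", query=query, size=q.top_k)  # index name assumed
    return {"es_search": results["hits"]["hits"]}
```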
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
import numpy as np
import json

# Connect to Elasticsearch
es = Elasticsearch(
    hosts=["http://192.168.8.224:19200"],  # new ES instance set up by Wang Lei
    # basic_auth=("elastic", "password")  # replace with your credentials
)
# Create the index if it does not exist
def create_index(index_name):
    if not es.indices.exists(index=index_name):
        index_settings = {
            "mappings": {
                "properties": {
                    "image_url": {"type": "keyword"},
                    "image_description": {"type": "text"},
                    "image_vector": {
                        "type": "dense_vector",
                        "dims": 2048,  # vector dimension; adjust to your embedding model
                        "index": True,
                        "similarity": "cosine"
                    }
                }
            }
        }
        es.indices.create(index=index_name, body=index_settings)
        print(f"Index {index_name} created")

# Insert a single image document
def insert_single_image(index_name, image_url, image_description, image_vector):
    doc = {
        "image_url": image_url,
        "image_description": image_description,
        "image_vector": image_vector
    }
    es.index(index=index_name, document=doc)
    print(f"Image {image_url} inserted")

# Bulk-insert image documents
def insert_bulk_images(index_name, image_data_list):
    actions = []
    for data in image_data_list:
        action = {
            "_index": index_name,
            "_source": {
                "image_url": data["image_url"],
                "image_description": data["image_description"],
                "image_vector": data["image_vector"]
            }
        }
        actions.append(action)
    bulk(es, actions)
    print(f"Bulk-inserted {len(actions)} documents")

# Full-text search
def text_search(index_name, query_text, size=10):
    query = {
        "match": {
            "image_description": query_text
        }
    }
    results = es.search(index=index_name, query=query, size=size)
    return results['hits']['hits']

# Vector search via script_score
def vector_search(index_name, query_vector, size=10):
    query = {
        "script_score": {
            "query": {"match_all": {}},
            "script": {
                "source": "cosineSimilarity(params.query_vector, 'image_vector') + 1.0",
                "params": {"query_vector": query_vector}
            }
        }
    }
    results = es.search(index=index_name, query=query, size=size)
    return results['hits']['hits']
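
# Hedged alternative: the dense_vector mapping above (index=True,
# similarity="cosine") also enables Elasticsearch 8.x native approximate kNN
# search, which is usually faster than script_score at scale. The
# num_candidates heuristic below is an assumption, not a tuned value.
def vector_search_knn(index_name, query_vector, size=10):
    knn = {
        "field": "image_vector",
        "query_vector": query_vector,
        "k": size,
        "num_candidates": max(100, size * 10),
    }
    results = es.search(index=index_name, knn=knn, size=size)
    return results['hits']['hits']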
# Combined text + vector search (weighted bool query)
def hybrid_search(index_name, query_text, query_vector, text_weight=0.5, vector_weight=0.5, size=10):
    query = {
        "bool": {
            "should": [
                {
                    "match": {
                        "image_description": {
                            "query": query_text,
                            "boost": text_weight
                        }
                    }
                },
                {
                    "script_score": {
                        "query": {"match_all": {}},
                        "script": {
                            "source": "cosineSimilarity(params.query_vector, 'image_vector') * params.vector_weight",
                            "params": {
                                "query_vector": query_vector,
                                "vector_weight": vector_weight
                            }
                        }
                    }
                }
            ]
        }
    }
    results = es.search(index=index_name, query=query, size=size)
    return results['hits']['hits']
# RRF fusion
def rrf_fusion(text_results, vector_results, k=60):
    """
    Fuse two retrieval result lists with Reciprocal Rank Fusion (RRF).

    Args:
        text_results: ES full-text search hits
        vector_results: ES vector search hits
        k: rank constant in the RRF formula, conventionally 60

    Returns:
        Fused result list
    """
    # Map document ID -> rank in the text results
    text_rank_map = {hit['_id']: rank + 1 for rank, hit in enumerate(text_results)}
    # Map document ID -> rank in the vector results
    vector_rank_map = {hit['_id']: rank + 1 for rank, hit in enumerate(vector_results)}
    # Union of all document IDs
    all_doc_ids = set(text_rank_map.keys()).union(set(vector_rank_map.keys()))
    # Compute RRF scores
    rrf_scores = {}
    for doc_id in all_doc_ids:
        text_rank = text_rank_map.get(doc_id, float('inf'))
        vector_rank = vector_rank_map.get(doc_id, float('inf'))
        # RRF formula: 1 / (k + rank)
        text_score = 1 / (k + text_rank) if text_rank != float('inf') else 0
        vector_score = 1 / (k + vector_rank) if vector_rank != float('inf') else 0
        rrf_scores[doc_id] = text_score + vector_score
    # Sort by RRF score, descending
    sorted_doc_ids = sorted(rrf_scores.items(), key=lambda x: x[1], reverse=True)
    # Build the final result list
    final_results = []
    # Prefer the full hit from the text results
    id_to_hit = {hit['_id']: hit for hit in text_results}
    # Add hits that only appear in the vector results
    for hit in vector_results:
        if hit['_id'] not in id_to_hit:
            id_to_hit[hit['_id']] = hit
    for doc_id, score in sorted_doc_ids:
        if doc_id in id_to_hit:
            result = {
                '_id': doc_id,
                '_score': score,
                '_source': id_to_hit[doc_id]['_source']
            }
            final_results.append(result)
    return final_results
# RRF-fused text + vector search
def hybrid_search_rrf(index_name, query_text, query_vector, size=10, rrf_k=60):
    """Fuse text and vector search results with RRF."""
    # Run both searches
    text_hits = text_search(index_name, query_text)
    vector_hits = vector_search(index_name, query_vector)
    # Fuse with RRF
    fused_results = rrf_fusion(text_hits, vector_hits, k=rrf_k)
    # Return the top `size` results
    return fused_results[:size]
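
# Worked example of the RRF arithmetic above (illustrative numbers, k=60):
# a document ranked 1st by text and 3rd by vector scores
#   1/(60+1) + 1/(60+3) ≈ 0.0164 + 0.0159 ≈ 0.0323,
# while a document ranked 2nd by text alone scores 1/(60+2) ≈ 0.0161,
# so agreement between the two retrievers outweighs a single good rank.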
# Example usage
if __name__ == "__main__":
    INDEX_NAME = "tst_images_text"

    # 1. Create the index
    create_index(INDEX_NAME)

    # Sample data
    sample_images = [
        {
            "image_url": "https://oss.5rs.me/oss/upload/image/jpeg/b2f3fbaa98a3f2552f2dba7451decf7b_36_20250605135114454.jpeg",
            "image_description": "、填一填。培优作业18 80 78 2.比 16 少9的数是( )多8的数是 17;27 );比( 与 30的和是( );47比3多( )个十和( )个一;7个十和2个一 3.83 里面有( 合起来是( ),它们 4.最大的两位数是(),最小的三位数是(相差(5.写一写,比一比。百位十位个位百位十位个位百位十位个位选择。(把正确答案的序号填在括号里)1.根据艾蒿叶子的影子判断生长时间最短的叶子是(2.在远古时代,为了记录猎物的多少,人们常用石子计数。假设用0表示10.用一表示1,那么下面表示的数最大的是",
            "image_vector": np.random.random(2048).tolist()  # replace with the real image vector
        },
        {
            "image_url": "https://oss.5rs.me/oss/upload/image/jpeg/9579d101ec560c85f204fa11b7a792c0_2_20250605094300322.jpeg",
            "image_description": "、直接写得数。培优作业1*18 x10=50 x90 =18 x41~51 x22~30x40 =12 x70 =32 x49 ~17 x78 ~20 x15 =44 x20 =78 x11~46 x23 ~二、填一填。1.13的30倍是(),10个22的和是()位数,积大约是(2.49 x28 的积是()。3.25x40的积的末尾有()个0。4.小亮平均每分钟走69米,他从家走到学校用了12分钟。小亮家离学校大约有()米。5.晶晶平均每分钟可以写25个字,照这样的速度,她15分钟可以写)个字。三、用竖式计算,并验算。18 x51 =32 x60 =46 x25 =四、在( )里填上“ >”“<”或“=”30 x30( )100070x12()80034 x56)36 x5415 x40( )60011 x44(訕興癪9 x13)450)13 x8921 x3425 x801圧ʌ帳八廈仈 x30)600)200023 x45",
            "image_vector": np.random.random(2048).tolist()  # replace with the real image vector
        }
    ]

    # 2. Bulk insert
    # insert_bulk_images(INDEX_NAME, sample_images)

    # 3. Text search example
    print("\n3. Text search results:")
    text_results = text_search(INDEX_NAME, "填一填")
    for hit in text_results:
        print(f"Score: {hit['_score']}, \nDescription: {hit['_source']['image_description']}")
        print()

    # 4. Vector search example
    print("\n4. Vector search results:")
    vector_results = vector_search(INDEX_NAME, sample_images[0]["image_vector"])
    for hit in vector_results:
        print(f"Score: {hit['_score']}, \nDescription: {hit['_source']['image_description']}")
        print()

    # 5. Hybrid search example
    print("\n5. Hybrid search results:")
    hybrid_results = hybrid_search(
        INDEX_NAME,
        "可爱的动物",  # query text ("cute animals")
        sample_images[0]["image_vector"],
        text_weight=0.4,
        vector_weight=0.6
    )
    for hit in hybrid_results:
        print(f"Score: {hit['_score']}, \nDescription: {hit['_source']['image_description']}")
        print()

    # 6. RRF hybrid search example
    print("\n6. RRF hybrid search results:")
    hybrid_results = hybrid_search_rrf(
        INDEX_NAME,
        "填一填。培优作业18 80 78 2.比 16 少9的数是( )多8的数是 17;27 );比( 与 30的和是( );47比3多( )个十和( )个一;7个十和2个",
        sample_images[0]["image_vector"],
        size=10,
        rrf_k=60
    )
    for hit in hybrid_results:
        print(f"Score: {hit['_score']:.4f}, \nDescription: {hit['_source']['image_description']}")
        print()

    print("\nfinished.")
from tritonclient import grpc as triton_grpc
import numpy as np
import json
from PIL import Image
import io
import base64
import time

INPUT_NAME = "input"
OUTPUT_NAME = "output"

def _create_triton_input(data):
    # Serialize the request dict to UTF-8 JSON inside a [1, 1] object array
    data = json.dumps(data, separators=(",", ":"))
    data = data.encode("utf-8")
    data = [[data]]
    data = np.array(data, dtype=np.object_)
    return data

def _parse_triton_output(data):
    # Decode the [1, 1] byte array back into a dict
    data = data[0, 0]
    data = data.decode("utf-8")
    data = json.loads(data)
    return data

def triton_request(client, data, *, request_kwargs=None):
    if request_kwargs is None:
        request_kwargs = {}
    input_ = triton_grpc.InferInput(INPUT_NAME, [1, 1], "BYTES")
    input_.set_data_from_numpy(_create_triton_input(data))
    results = client.infer('ocr', inputs=[input_], **request_kwargs)
    output = results.as_numpy(OUTPUT_NAME)
    return _parse_triton_output(output)

def infer_paddle_ocr(im: Image.Image, client):
    # Encode the image as base64 JPEG and query the PaddleOCR Triton service,
    # retrying until the service reports success (errorCode == 0)
    buffer = io.BytesIO()
    im.save(buffer, format='JPEG')
    buffer.seek(0)
    base64_file = base64.b64encode(buffer.read()).decode("ascii")
    input_ = {"file": base64_file,
              "fileType": 1}
    while True:
        output_ocr = triton_request(client, input_)
        errorCode = output_ocr["errorCode"]
        if errorCode == 0:
            result = output_ocr["result"]
            res_ocr = result["ocrResults"][0]['prunedResult']
            db_boxes = res_ocr['rec_boxes']
            rec_texts = res_ocr['rec_texts']
            break
    return db_boxes, rec_texts

if __name__ == '__main__':
    start_time = time.time()
    paddle_ocr_url = '61.170.32.8:38896'
    client_ocr = triton_grpc.InferenceServerClient(paddle_ocr_url)
    # im = '/data/liyaze/doc2xkey_server_v2/utils/my-obs-object-key-demo.jpg'
    # im = "/home/liuxin/work/search_book/data/12664728-课计划·七年级英语·RJ·上册/1.jpg"
    im = "/home/liuxin/work/search_question/data/12664728-课计划·七年级英语·RJ·上册/93.jpg"
    im = Image.open(im).convert('RGB')
    db_boxes, rec_texts = infer_paddle_ocr(im, client_ocr)
    print(f"Elapsed: {time.time() - start_time}s")
    print(rec_texts)
import re, json, sys, os
import pandas as pd
import numpy as np

class prepareData():
    def __init__(self):
        # ES field names for each excel column
        self.index = "index"  # int
        self.fileId = "bankId"  # int
        self.question_image_url = "question_image_url"
        self.question_text = "question_text"
        self.answer_image_url = "answer_image_url"
        self.answer_text = "answer_text"
        self.question_chapter = "question_chapter"
        self.question_category = "question_category"
        self.page_id = "page_id"  # int
        self.book_page_id = "book_page_id"
        self.knowledge_point = "knowledge_point"
        self.knowledge_point_code = "knowledge_point_code"
        self.difficult = "difficult"
        self.resolve = "resolve"
        self.appendix = "appendix"
        self.source_image_url = "source_image_url"
        self.page_ocr = "page_ocr"
        self.question_ocr = 'question_ocr'
        # Map the Chinese excel headers to the ES field names
        self.title2tile = {
            "序号": self.index,
            "题目id": self.fileId,
            "题目图片地址": self.question_image_url,
            "题目文本": self.question_text,
            "答案图片地址": self.answer_image_url,
            "答案文本": self.answer_text,
            "所属章节": self.question_chapter,
            "题目类型": self.question_category,
            "电子页码": self.page_id,
            "纸书页码": self.book_page_id,
            "知识点": self.knowledge_point,
            "知识点代码": self.knowledge_point_code,
            "难度": self.difficult,
            "解析": self.resolve,
            "附件": self.appendix,
            "source_image_url": self.source_image_url,
            "page_ocr": self.page_ocr,
            "question_ocr": self.question_ocr,
        }

    def clean(self, text):
        # Strip <img> tags and paragraph markup from HTML-formatted text
        text = re.sub(r'<img[^>]*>', '', text)
        text = re.sub(r"</p><p>", " ", text)
        text = re.sub(r"<p>", " ", text)
        text = re.sub(r"</p>", " ", text)
        text = text.strip()
        return text
    def read_excel(self, excel_path, sheet_name="Sheet1", book_id=None, book_name=None):
        data = pd.read_excel(excel_path, sheet_name)
        data = data.rename(columns=self.title2tile)
        data = data.fillna("")  # fillna returns a new frame, so assign it back
        data = data.to_dict(orient='records')
        page_id2bank_id = {}  # all question ids on each page
        for item in data:
            item[self.index] = int(item[self.index])
            item[self.fileId] = int(item[self.fileId])
            item[self.question_image_url] = str(item[self.question_image_url])
            item[self.question_text] = str(item[self.question_text])
            item[self.answer_image_url] = str(item[self.answer_image_url])
            item[self.answer_text] = str(item[self.answer_text])
            item[self.question_chapter] = str(item[self.question_chapter])
            item[self.question_category] = str(item[self.question_category])
            page_id = int(item[self.page_id])
            item[self.page_id] = int(item[self.page_id])
            item[self.book_page_id] = str(item[self.book_page_id])
            item[self.knowledge_point] = str(item[self.knowledge_point])
            item[self.knowledge_point_code] = str(item[self.knowledge_point_code])
            item[self.difficult] = str(item[self.difficult])
            item[self.resolve] = str(item[self.resolve])
            item[self.appendix] = str(item[self.appendix])
            item['book_id'] = str(book_id)
            item['book_name'] = book_name
            item['book_id_page_id'] = str(book_id) + "-" + str(page_id)
            item.pop('question_category')
            item.pop('appendix')
            # 1. OCR of the full-page image
            # page_ocr = json.loads(item[self.page_ocr])['rec_texts']  # PaddleOCR variant
            # page_ocr = " ".join(page_ocr)
            page_ocr = item[self.page_ocr]  # Hehe OCR
            item[self.page_ocr] = page_ocr
            # 2. OCR of the question image
            # question_ocr = json.loads(item[self.question_ocr])['rec_texts']  # PaddleOCR variant
            # question_ocr = " ".join(question_ocr)
            question_ocr = item[self.question_ocr]  # Hehe OCR
            item[self.question_ocr] = question_ocr
            # 3. Collect all question ids on each page
            bank_id = int(item[self.fileId])
            if page_id in page_id2bank_id:
                page_id2bank_id[page_id].append(bank_id)
            else:
                page_id2bank_id[page_id] = [bank_id]
            # 4. Embedding of the original question image
            if "question_image_embedding" in item.keys():
                item['question_image_embedding'] = json.loads(item['question_image_embedding'])
            # 5. Embedding of the full-page image
            if "page_image_embedding" in item.keys():
                item['page_image_embedding'] = json.loads(item['page_image_embedding'])
        for item in data:
            page_id = int(item[self.page_id])
            bank_ids = page_id2bank_id[page_id]  # e.g. [12, 2134, 321]
            item['page_id2bank_ids'] = bank_ids  # all question ids on this page
        return data
    def write2json(self, path, data_list):
        with open(path, "w", encoding='utf-8') as f:
            data_list = json.dumps(data_list, ensure_ascii=False, indent=4)
            f.write(data_list)
        print(f"success save: {path}")

    def result(self):
        excel_file = "/home/liuxin/work/search_question/data/12671977-暑假生活·学期总复习八年级物理通用版6-原图/12671977-暑假生活·学期总复习八年级物理通用版6.xlsx"
        sheet_name = "题库数据"
        book_id = 12671977
        book_name = '暑假生活·学期总复习八年级物理通用版6'
        data_list = self.read_excel(excel_file, sheet_name=sheet_name, book_id=book_id, book_name=book_name)
        save_json_path = excel_file + ".json"
        self.write2json(save_json_path, data_list)

if __name__ == "__main__":
    prepare_data = prepareData()
    prepare_data.result()
    print("\nfinished.")