Commit 6d5baff6 by unknown

init

#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os,re,json,sys
__path__ = os.path.dirname(os.path.abspath(__file__))
sys.path.append(__path__)
sys.path.append(os.path.join(__path__, 'web_search_source'))
import uvicorn
import asyncio
import logging
from fastapi import FastAPI, Request, HTTPException
from fastapi.responses import JSONResponse
from fastapi.exceptions import RequestValidationError
from pydantic import BaseModel
from save_es_database import ESsearch
from web_search_source.web_search_resource import webSearchResource
def setup_logger():
    # Create the logger object
logger = logging.getLogger('AI_planner')
logger.setLevel(logging.INFO)
    # Make sure the log directory exists
__path__ = os.path.dirname(os.path.abspath(__file__))
log_dir = os.path.join(__path__, "log")
if not os.path.exists(log_dir):
os.makedirs(log_dir)
log_file = os.path.join(log_dir, "AI_planner.log")
    # Create a file handler
file_handler = logging.FileHandler(log_file, encoding='utf-8')
file_handler.setLevel(logging.INFO)
    # Console handler (kept disabled; logs go to the file only)
# console_handler = logging.StreamHandler()
# console_handler.setLevel(logging.INFO)
    # Set the log format
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
file_handler.setFormatter(formatter)
# console_handler.setFormatter(formatter)
    # Attach the handler
logger.addHandler(file_handler)
# logger.addHandler(console_handler)
return logger
# Create the global logger instance
logger_es = setup_logger()
class Item(BaseModel):
    tableName: str = ""        # SOP table name
    bookCategory: str = ""     # book category: K12, 童书 (children's books), 其他图书 (other books)
    schoolStage: str = ""      # school stage: "初中", "小学", "高中"
    studentGrade: str = ""     # grade
    subjectCategory: str = ""  # subject, e.g. 语文, 数学
    bookVersion: str = ""      # edition, e.g. 人教版, 通用版
    purpose: str = ""          # the target reader's purpose
    keyword: str = ""          # keywords to search for
class ItemWebSearch(BaseModel):
    bookName: str = ""      # book title
    bookClassify: str = ""  # book category
    author: str = ""        # book author
    introduction: str = ""  # book introduction
app = FastAPI()
es_search = ESsearch(hosts=['http://localhost:9200'])
# API endpoints (comment left over from the Chinese-to-Korean translation service)
# 1. Heartbeat check
@app.get("/health/")
async def health():
    res = JSONResponse(status_code=200, content={"message": "no_ai_source service is up."})
return res
# 2. Search ES for non-AI resources
@app.post("/no_ai_source/")
async def no_ai_source(input: Item):
# print("input: ",input)
"""
接收一字典参数,返回es搜索的 非AI资源清单
"""
try:
input = json.loads(input.json())
es_res = es_search.search(input)
logger_es.info(f"ES search no_ai_source input : {input} ; \nes_search : {json.dumps(es_res, ensure_ascii=False, indent=4)}")
res = JSONResponse(status_code=200, content=es_res)
except Exception as e:
logger_es.error(f" no_ai_source input : {input}; error message : {e}")
res = JSONResponse(
status_code=500,
content={"message": str(e)},
)
return res
if __name__ == "__main__":
    uvicorn.run(app="api_service:app", host="0.0.0.0", port=9860, workers=1)  # the deployed service listens on port 9860
# Start the API service on the 116.63.110.220 server
# netstat -ntlp | grep 9860
# cd /home/liuxin/work/AI_planner
# conda activate translate
# nohup python -u api_service.py > log/api_service.log 2>&1 &
# tail -f log/api_service.log
# tail -f log/AI_planner.log
# uvicorn api_service:app --host 0.0.0.0 --port 9860 --workers 1
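# For reference, a minimal client sketch for the two endpoints above. It assumes the
# service is reachable at localhost:9860 (matching the uvicorn.run call); the field
# values are illustrative, not real data.
# import requests
# payload = {"bookCategory": "K12", "schoolStage": "小学", "studentGrade": "二年级",
#            "subjectCategory": "语文", "bookVersion": "人教版",
#            "purpose": "基础学习", "keyword": "阅读;写作"}
# resp = requests.post("http://localhost:9860/no_ai_source/", json=payload)
# print(resp.status_code, resp.json()[:2] if resp.ok else resp.text)  # first two hits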
import re, json, sys , os
import pandas as pd
# Non-AI resource inventory database: convert each Excel row to a JSON string
def knowledge_json(file, sheet_names:list, save_file):
length_record = []
save_jsonl_data = []
for sheet_name in sheet_names:
data = pd.read_excel(file, sheet_name=sheet_name, keep_default_na=False)
data = data.to_dict(orient='records')
for line in data:
if "序号" in line:
line.pop("序号")
try:
line = json.dumps(line, ensure_ascii=False)
line = re.sub("\n", "", line)
line = re.sub("\t", "", line)
length_record.append(len(line))
if len(line) > 3000:
print(line)
save_jsonl_data.append(line)
            except Exception:
print(line)
res = "\n".join(save_jsonl_data)
with open(save_file, 'w', encoding='utf-8') as f:
f.write(res)
print(f"max length: {max(length_record)}")
if __name__ == "__main__":
    file = r"D:\0_shu_chuan_work\work\AI_planner\data\非AI工具.xlsx"
    save_file = r"D:\0_shu_chuan_work\work\AI_planner\data\伴学工具0.txt"
    # Run one sheet at a time; only the last assignment takes effect.
    sheet_names = ['伴学工具20250214']       # max length: 157
    sheet_names = ['第三方自有资源评级详表']  # max length: 429
    sheet_names = ['品牌资源(爱奇艺+慕课)']  # max length: 333
    sheet_names = ['小睿资讯服务']           # max length: 2916
    sheet_names = ['测评库资源']             # max length: 259
knowledge_json(file, sheet_names, save_file)
print("finished.")
import re, json, os, sys
import copy
import pandas as pd
class prepareESdata():
    # Normalize the data from the non-AI resource SOP sheets so it can be saved into the ES database
    def __init__(self):
        self.head2key = []
def clean(self, text):
text = text.strip()
text = re.sub("“", " ", text)
text = re.sub("”", " ", text) # “”
text = re.sub('"', " ", text) # ""'
text = re.sub("'", " ", text)
return text
def add_message(self, message):
add_info = []
if "中考" in message:
add_info.append('初三')
if "幼小衔接" in message:
add_info.append('一年级')
if "五六年级" in message:
add_info.append('五年级')
add_info.append('六年级')
if "中小学" in message:
add_info.append('小学')
add_info.append('初中')
if "初升高" in message:
add_info.append('初三')
if "中小学" in message:
add_info.append('小学')
add_info.append('初中')
if "1年级" in message:
add_info.append('一年级')
if "2年级" in message:
add_info.append('二年级')
if "3年级" in message:
add_info.append('三年级')
if "1-2年级" in message:
add_info.append('一年级')
add_info.append('二年级')
if "3-6年级" in message:
add_info.append('三年级')
add_info.append('四年级')
add_info.append('五年级')
add_info.append('六年级')
if "初升高" in message:
add_info.append('初三')
add_info = list(set(add_info))
add_info = ";".join(add_info)
return add_info
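    # Example of add_message (values follow the rules above; item order varies
    # because of the set() dedupe):
    #   add_message("中小学1-2年级") -> e.g. "小学;初中;一年级;二年级"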
def study_tool(self, excel_path, sheet_name):
        # 1. 伴学工具 (study-companion tools)
head2key1 = {"raysAppId":"raysAppId", "typeCode":"typeCode", "typeName":"resourceName",
"subjectType":"subjectCategory", "title":"resourceDescribe"}
self.head2key.append(head2key1)
key_value = {"bookCategory":"K12", "tableName":"伴学工具"}
data = pd.read_excel(excel_path, sheet_name=sheet_name)
        data = data.fillna("")  # fillna returns a new frame; assign it back
data = data.to_dict(orient='records')
save_data = []
for item in data:
temp = copy.deepcopy(key_value)
for k, v in item.items():
v = str(v)
if v == "NaT":
v = ""
if k in head2key1.keys() :
temp[head2key1[k]] = self.clean( v)
message = [temp['bookCategory'], temp['subjectCategory'], temp['resourceName'], temp['resourceDescribe']]
message = [item for item in message if item]
message = list(set(message))
message = ";".join(message)
temp['message'] = message + ";" + self.add_message(message)
save_data.append(temp)
return save_data
def third_party_own_resource(self, excel_path, sheet_name):
        # 2. 第三方自有资源评级详表 (third-party owned-resource rating table)
head2key2 = {"资源编号": "typeCode", "创建时间": "createTime", "资源名称": "resourceName", "作品类型": "workType",
"作者名称": "authorName", "资源简介": "resourceDescribe", "图书类型/受众": "bookCategory",
"学段": "schoolStage", "年级": "studentGrade", "学科": "subjectCategory", "内容": "resourceContent",
"版本": "bookVersion","科目/标签":"subjectORlabel", "年级/深度":"gradeDeep", "目的": "purpose",
"学期":"studentSemester", "版本.1":"textBookEdition", "销售价": "bookPrice"
}
self.head2key.append(head2key2)
data = pd.read_excel(excel_path, sheet_name).fillna("")
data = data.to_dict(orient='records')
save_data = []
key_value = {"tableName":"第三方自有资源评级详表"}
for item in data:
temp = copy.deepcopy(key_value)
if "缺" in item['资源简介']:
continue
for k, v in item.items():
v = str(v)
if v == "NaT":
v = ""
if k=='创建时间':
v = str(v)[:10]
if k in head2key2.keys() :
temp[head2key2[k]] = self.clean( v)
message = [temp['bookCategory'], temp['resourceName'], temp['workType'], temp['resourceDescribe'], temp['schoolStage'],
temp['studentGrade'], temp['subjectCategory'], temp['resourceContent'], temp['bookVersion'], temp['subjectORlabel'],
temp['gradeDeep'], temp['purpose'],temp['studentSemester'], temp['textBookEdition'],
]
message = [item for item in message if item]
message = list(set(message))
message = ";".join(message)
temp['message'] = message+ ";" + self.add_message(message)
save_data.append(temp)
return save_data
def brand_resource(self, excel_path, sheet_name):
        # 3. 品牌资源(爱奇艺+慕课) (brand resources: iQIYI + MOOC)
head2key3 = {"资源编号": "typeCode", "创建时间": "createTime", "作品名称": "resourceName", "作品类型": "workType",
"作者名称": "authorName", "图书类型/受众": "bookCategory", "科目/标签":"subjectORlabel", "年级/深度":"gradeDeep",
"外链销售价": "bookPrice"
}
self.head2key.append(head2key3)
data = pd.read_excel(excel_path, sheet_name).fillna("")
data = data.to_dict(orient='records')
save_data = []
key_value = {"tableName":"品牌资源(爱奇艺+慕课)"}
for item in data:
temp = copy.deepcopy(key_value)
for k, v in item.items():
v = str(v)
if v == "NaT":
v = ""
if k == '创建时间':
v = str(v)[:10]
if k in head2key3.keys() :
temp[head2key3[k]] = self.clean( v)
message = [
temp['resourceName'], temp['workType'], temp['bookCategory'], temp['subjectORlabel'], temp['gradeDeep']
]
message = [item for item in message if item]
message = list(set(message))
message = ";".join(message)
temp['message'] = message+ ";" + self.add_message(message)
save_data.append(temp)
return save_data
def xiaorui_information(self, excel_path, sheet_name):
        # 4. 小睿资讯服务 (Xiaorui information service)
head2key4 = {"资讯编号": "typeCode", "创建时间": "createTime", "资讯标题(26字内)": "resourceName", "资讯来源":"resourceFrom",
"受众": "audience", "科目/标签": "subjectORlabel", "目的": "purpose", "bookCategory":"bookCategory"
}
self.head2key.append(head2key4)
data = pd.read_excel(excel_path, sheet_name).fillna("")
data = data.to_dict(orient='records')
save_data = []
key_value = {"tableName": "小睿资讯服务"}
for item in data:
temp = copy.deepcopy(key_value)
for k, v in item.items():
v=str(v)
if v == "NaT":
v = ""
                if k == '创建时间':
                    v = str(v)[:10]
if k == "bookCategory" and v not in ["K12", "童书"]:
v = "其他图书"
if k in head2key4.keys() :
temp[head2key4[k]] = self.clean( v)
message = [temp['bookCategory'], temp['resourceName'], temp['resourceFrom'], temp['subjectORlabel'],
temp['audience'], temp['purpose'],
]
message = [item for item in message if item]
message = list(set(message))
message = ";".join(message)
temp['message'] = message+ ";" + self.add_message(message)
save_data.append(temp)
return save_data
def evaluation_database(self, excel_path, sheet_name):
        # 5. 测评库资源 (assessment-bank resources)
head2key5 = {"资源编号": "typeCode", "创建时间": "createTime",
"作品名称": "resourceName", "资源简介": "resourceDescribe", "科目/标签": "subjectCategory", "年级/深度":"gradeDeep",
"图书类型/受众":"bookCategory"
}
self.head2key.append(head2key5)
data = pd.read_excel(excel_path, sheet_name).fillna("")
data = data.to_dict(orient='records')
save_data = []
key_value = {"tableName": "测评库资源"}
for item in data:
temp = copy.deepcopy(key_value)
for k, v in item.items():
v=str(v)
if v == "NaT":
v = ""
if k == '创建时间':
v = str(v)[:10]
if k == "图书类型/受众":
bookCategory = []
if "K12" in v:
bookCategory.append('K12')
if "童书" in v:
bookCategory.append('童书')
if "大众生活" in v:
bookCategory.append('其他图书')
if "不限" in v:
bookCategory.append('K12')
bookCategory.append('童书')
bookCategory.append('其他图书')
if not bookCategory:
bookCategory.append('其他图书')
bookCategory = list(set(bookCategory))
bookCategory = ";".join(bookCategory)
v = bookCategory
if k in head2key5.keys() :
temp[head2key5[k]] = self.clean( v)
message = [temp['bookCategory'], temp['resourceName'], temp['resourceDescribe'],
temp['subjectCategory'], temp['gradeDeep'],
]
message = [item for item in message if item]
message = list(set(message))
message = ";".join(message)
temp['message'] = message+ ";" + self.add_message(message)
save_data.append(temp)
return save_data
def existing_tool(self, excel_path, sheet_name):
        # 6. 已有工具 (existing tools; 数独闯关 excluded)
        # Mostly AI-type tools, so this table is not written into the ES database
head2key6 = {"资源编号": "typeCode", "工具名称": "resourceName", "介绍": "resourceDescribe", "科目/标签": "subjectCategory",
"年级/深度": "gradeDeep",
"图书类型/受众": "bookCategory"
}
# # self.head2key.append(head2key6)
data = pd.read_excel(excel_path, sheet_name).fillna("")
data = data.to_dict(orient='records')
save_data = []
key_value = {"tableName": "已有工具(数独闯关不算)"}
for item in data:
temp = copy.deepcopy(key_value)
for k, v in item.items():
v = str(v)
if v == "NaT":
v = ""
if k == '创建时间':
v = str(v)[:10]
if k == "bookCategory":
bookCategory = []
if "K12" in v:
bookCategory.append('K12')
if "童书" in v:
bookCategory.append('童书')
if "大总生活" in v:
bookCategory.append('其他图书')
if "不限" in v:
bookCategory.append('其他图书')
if not bookCategory:
bookCategory.append('其他图书')
bookCategory = ";".join(bookCategory)
v = bookCategory
if k in head2key6.keys():
temp[head2key6[k]] = self.clean( v)
message = [temp['bookCategory'], temp['resourceName'], temp['resourceDescribe'],
temp['subjectCategory'], temp['gradeDeep'],
]
message = [item for item in message if item]
message = list(set(message))
message = ";".join(message)
temp['message'] = message+ ";" + self.add_message(message)
save_data.append(temp)
return save_data
def result(self):
save_sheets = []
save_sheets_name = []
path = r"/home/liuxin/work/AI_planner/data/非AI工具"
        # 1. 伴学工具 (study-companion tools)
excel_path = f"/home/liuxin/work/AI_planner/data/非AI工具/伴学工具-2025-02-14.xlsx"
sheet_name = 'Sheet1'
        study_tool = self.study_tool(excel_path, sheet_name)  # study-companion tools
data = pd.DataFrame(study_tool)
save_sheets.append(copy.deepcopy(data))
save_sheets_name.append("伴学工具")
        study_tool_demo = study_tool[:4]
        print(json.dumps(study_tool_demo, ensure_ascii=False, indent=4))
        # 2. 第三方自有资源评级详表 (third-party owned-resource rating table)
excel_path = f"/home/liuxin/work/AI_planner/data/非AI工具/商务资源SOP管理表read.xlsx"
sheet_name = '第三方自有资源评级详表'
third_party = self.third_party_own_resource( excel_path, sheet_name)
data = pd.DataFrame(third_party)
save_sheets.append(copy.deepcopy(data))
save_sheets_name.append(sheet_name)
        third_party_demo = third_party[:4]
        print(json.dumps(third_party_demo, ensure_ascii=False, indent=4))
        # 3. 品牌资源(爱奇艺+慕课) (brand resources: iQIYI + MOOC)
excel_path = f"/home/liuxin/work/AI_planner/data/非AI工具/商务资源SOP管理表read.xlsx"
sheet_name = '品牌资源(爱奇艺+慕课)'
brand_resource = self.brand_resource( excel_path, sheet_name)
data = pd.DataFrame(brand_resource)
save_sheets.append(copy.deepcopy(data))
save_sheets_name.append(sheet_name)
        brand_resource_demo = brand_resource[:4]
        print(json.dumps(brand_resource_demo, ensure_ascii=False, indent=4))
        # 4. 小睿资讯服务 (Xiaorui information service)
excel_path = f"/home/liuxin/work/AI_planner/data/非AI工具/商务资源SOP管理表read.xlsx"
sheet_name = '小睿资讯服务'
xiaorui_information = self.xiaorui_information(excel_path, sheet_name)
data = pd.DataFrame(xiaorui_information)
save_sheets.append(copy.deepcopy(data))
save_sheets_name.append(sheet_name)
        xiaorui_information_demo = xiaorui_information[:4]
        print(json.dumps(xiaorui_information_demo, ensure_ascii=False, indent=4))
        # 5. 测评库资源 (assessment-bank resources)
excel_path = f"/home/liuxin/work/AI_planner/data/非AI工具/商务资源SOP管理表read.xlsx"
sheet_name = '测评库资源'
evaluation_database = self.evaluation_database(excel_path, sheet_name)
data = pd.DataFrame(evaluation_database)
save_sheets.append(copy.deepcopy(data))
save_sheets_name.append(sheet_name)
        evaluation_database_demo = evaluation_database[:4]
        print(json.dumps(evaluation_database_demo, ensure_ascii=False, indent=4))
        # Save all the non-AI resources into one JSON file
json_data = study_tool + third_party + brand_resource + xiaorui_information + evaluation_database
print(f"准备了{len(json_data)} 条数据写入es数据库.") # 准备了18137 条数据写入es数据库.
file = os.path.join(path, "商务资源SOP管理表es.json")
        with open(file, "w", encoding='utf-8') as f:
            for id, item in enumerate(json_data):
                item['id'] = id
            f.write(json.dumps(json_data, ensure_ascii=False, indent=4))  # assign ids first, then write the list once
        # Save the data into multiple sheets of a single Excel workbook
with pd.ExcelWriter(excel_path[:-5] + "_es" + ".xlsx") as writer:
for sheet_name, data in zip(save_sheets_name, save_sheets):
data.to_excel(writer, sheet_name=sheet_name, index=False)
        # All field names across the five tables
keys_all = []
for source in [study_tool[0], third_party[0], brand_resource[0], xiaorui_information[0], evaluation_database[0]]:
for k,v in source.items():
keys_all.append(k)
keys_all = list(set(keys_all))
print(keys_all) # ['id', 'studentSemester', 'audience', 'studentGrade', 'bookPrice', 'resourceDescribe', 'resourceContent', 'subjectORlabel', 'typeCode', 'tableName', 'gradeDeep', 'bookCategory', 'resourceFrom', 'createTime', 'raysAppId', 'resourceName', 'workType', 'schoolStage', 'bookVersion', 'purpose', 'message', 'authorName', 'subjectCategory', 'textBookEdition']
print(len(keys_all)) # 24
if __name__ == "__main__":
prepare_es_data = prepareESdata()
prepare_es_data.result()
print("\nfinished.")
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import copy
import json,re,sys,os
from elasticsearch import Elasticsearch, helpers # elasticsearch==7.0.0; pandas==2.2.3; numpy==2.0.2
import jieba
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor, as_completed
# Save the non-AI resource JSON data into the ES database
class EsHelper:
def __init__(self, hosts=None):
"""初始化ES连接
Args:
hosts: ES服务器地址列表,默认为None使用已配置的地址
"""
        if hosts is None:
            # earlier variants used dict hosts, e.g. {'host': '116.63.110.220', 'port': 9200}
            hosts = ['http://localhost:9200']  # or http://116.63.110.220:9200
        self.hosts = hosts
        self.es = Elasticsearch(hosts=self.hosts, timeout=1000)
print("self.es.ping(): ", self.es.ping())
def create_index(self, index_name):
"""创建索引
Args:
index_name: es索引名称
settings: 索引设置
mappings: 索引映射
['tableName','bookCategory', 'typeCode', 'raysAppId', 'createTime', 'resourceName', 'subjectCategory', 'studentSemester',
'audience', 'studentGrade', 'resourceDescribe', 'resourceContent', 'subjectORlabel',
'gradeDeep', 'resourceFrom',
'workType', 'schoolStage', 'bookVersion', 'purpose', 'authorName', 'textBookEdition', 'message', 'bookPrice'
]
Returns:
创建结果
"""
body = {}
settings = {
"index": {
"number_of_shards": 1, # 定义索引的分片数量 方便在不同的节点部署
"number_of_replicas": 0, # 定义索引中每个主分片的副本数量
}}
mappings = {
"dynamic": True,
"properties": {
"tableName": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"bookCategory": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"typeCode": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"raysAppId": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"createTime": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"resourceName": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"subjectCategory": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"studentSemester": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"audience": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"studentGrade": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}, # 10
"resourceDescribe": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"resourceContent": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"subjectORlabel": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"gradeDeep": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"resourceFrom": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"workType": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"schoolStage": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"bookVersion": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"purpose": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"authorName": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}, # 20
"textBookEdition": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"message": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"bookPrice": {
"type": "float",
},
}
}
if settings:
body['settings'] = settings
if mappings:
body['mappings'] = mappings
res = self.es.indices.create(index=index_name, body=body, timeout='100s')
print(f"create index {index_name} success.")
return res
def delete_index(self, index_name):
"""删除索引
Args:
index_name: 索引名称
Returns:
删除结果
"""
res = self.es.indices.delete(index=index_name)
print(f"删除索引 {index_name} success.")
return res
def index_exists(self, index_name):
"""判断索引是否存在
Args:
index_name: 索引名称
Returns:
bool: 是否存在
"""
return self.es.indices.exists(index=index_name)
def read_dict(self, file):
        # 1. Read the 中朝大辞典 data (OCR-parsed, then cleaned with GPT-4o)
with open(file, 'r', encoding='utf-8') as f:
data_list = json.loads(f.read())
res = []
for item in data_list:
temp = {}
for k, v in item.items():
if not v:
pass
else:
temp[k] = v
res.append(copy.deepcopy(temp))
print("len(data_list): ", len(data_list))
print("len(res): ", len(res))
return res
def insert_doc(self, index_name, doc_body, doc_id=None):
"""插入文档
Args:
index_name: 索引名称
doc_body: 文档内容
doc_id: 文档ID,可选
Returns:
插入结果
"""
return self.es.index(index=index_name, body=doc_body, id=doc_id)
def insert_doc_batch(self, index_name, doc_batch: list):
        # Bulk-insert into the ES database
actions = []
for idx, body in enumerate(doc_batch):
if 'id' in body:
idx = body['id']
temp = {"_index": index_name, "_id": idx}
temp['_source'] = body
actions.append(temp)
        # Bulk-insert the documents using helpers.bulk
helpers.bulk(self.es, actions)
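    # A minimal bulk-insert sketch (assumes a running ES on localhost:9200 and an
    # index created with create_index above; the documents are illustrative):
    # es = EsHelper()
    # docs = [{"id": 0, "tableName": "伴学工具", "message": "K12;语文"},
    #         {"id": 1, "tableName": "测评库资源", "message": "童书"}]
    # es.insert_doc_batch("no_ai_source", docs)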
def get_doc(self, index_name, doc_id):
"""获取文档
Args:
index_name: 索引名称
doc_id: 文档ID
Returns:
文档内容
"""
return self.es.get(index=index_name, id=doc_id)
def update_doc(self, index_name, doc_id, doc_body):
"""更新文档
Args:
index_name: 索引名称
doc_id: 文档ID
doc_body: 更新的内容
Returns:
更新结果
"""
return self.es.update(index=index_name, id=doc_id, body={'doc': doc_body})
def delete_doc(self, index_name, doc_id):
"""删除文档
Args:
index_name: 索引名称
doc_id: 文档ID
Returns:
删除结果
"""
return self.es.delete(index=index_name, id=doc_id)
def search(self, index_name, query_body):
"""搜索文档
"""
return self.es.search(index=index_name, body=query_body)
class ESsearch():
    # Search the ES database
def __init__(self, hosts=['http://localhost:9200']):
self.es = Elasticsearch(hosts=hosts, timeout=30)
self.index_name = 'no_ai_source'
        self.size_study_tool = 5                 # 伴学工具 (this table is not used)
        self.size_third_party_own_resource = 50  # 第三方自有资源评级详表 (earlier value: 100)
        self.size_brand_resource = 35            # 品牌资源(爱奇艺+慕课)
        self.size_xiaorui_information = 10       # 小睿资讯服务
        self.size_evaluation_database = 5        # 测评库资源 (this table is not used)
        # Query fields: schoolStage (e.g. 小学), subjectCategory (e.g. 语文), studentGrade (e.g. 一年级).
def execute_query(self, query=None, size=None, offset=0):
        # Execute an ES query against the non-AI resource index
if query is None:
query = {"bool": {
"should": [{"match_phrase": {"message": "K12"}}]
}
}
query_body = {"from": offset,
"query": query
}
if size:
query_body['size'] = size
res = self.es.search(index=self.index_name, body=query_body)
res_hits = []
if 'hits' in res and 'hits' in res['hits']:
hits = res['hits']['hits']
for item in hits:
if "_source" in item and item['_source']:
res_hits.append(item['_source'])
        for item in res_hits:
            for k, v in item.items():
                if isinstance(v, str):
                    # Strip straight and curly quotes (replaced with spaces)
                    v = re.sub('"', " ", v)
                    v = re.sub("'", " ", v)
                    v = re.sub("“", " ", v)
                    v = re.sub("”", " ", v)
                    item[k] = v
return res_hits
def search_study_tool(self, input):
        # 1. ES search over 伴学工具 (all of these are K12 resources)
query = {"bool": {"should": [],
"must":[]
}
}
must_query = [
{"match_phrase": {"tableName": "伴学工具"}},
{"match_phrase": {"bookCategory": input['bookCategory']}}
]
should_query = []
for k, v in input.items():
if not v:
continue
if k in [ "subjectCategory"] and v: # 学科 语数外
temp = {"match_phrase": {"message": v}}
must_query.append(temp)
else:
# temp = {"match": {"message": v}}
temp = {"match": {"resourceName": v}}
should_query.append(temp)
query['bool']['should'] = should_query
query['bool']['must'] = must_query
return query
def search_third_party_own_resource(self, input):
        # 2. ES search over 第三方自有资源评级详表
# print("2、es搜索 第三方自有资源评级详表")
# print(input)
"""
input = {
"subjectCategory": "语文",
"schoolStage": "小学",
"studentGrade": "二年级",
"purpose": "基础学习",
"bookVersion": "人教版",
"keyword": "现代汉语;古代汉语;中国文学常识;外国文学常识;阅读;写作;教学"
}
"""
query = {"bool": {"should": [],
"must": []
}
}
must_query = [
{"match_phrase": {"tableName": "第三方自有资源评级详表"}},
{"match_phrase": {"bookCategory": input['bookCategory']}}
]
should_query = []
for k, v in input.items():
if not v:
continue
            if input['bookCategory'] == "K12" and k in ["schoolStage", "studentGrade", "subjectCategory", "bookVersion"] and v:  # bookCategory, not subjectCategory, holds "K12"
v = v.strip().split(";")
v = list(set(v))
for v0 in v:
temp = {"match_phrase": {"message": v0}}
must_query.append(temp)
else:
# temp = {"match": {"message": v}}
temp = {"match": {"resourceName": v}}
should_query.append(temp)
query['bool']['should'] = should_query
query['bool']['must'] = must_query
return query
def search_brand_resource(self, input):
        # 3. ES search over 品牌资源(爱奇艺+慕课)
query = {"bool": {"should": [],
"must": []
}
}
must_query = [
{"match_phrase": {"tableName": "品牌资源(爱奇艺+慕课)"}},
{"match_phrase": {"bookCategory": input['bookCategory']}}
]
should_query = []
for k, v in input.items():
if not v:
continue
            if input['bookCategory'] == "K12" and k in ["schoolStage", "subjectCategory", "studentGrade"] and v:  # bookCategory holds "K12"
v = v.strip().split(";")
v = list(set(v))
for v0 in v:
temp = {"match_phrase": {"message": v0}}
must_query.append(temp)
else:
# temp = {"match": {"message": v}}
temp = {"match": {"resourceName": v}}
should_query.append(temp)
query['bool']['should'] = should_query
query['bool']['must'] = must_query
return query
def search_xiaorui_information(self, input):
        # 4. ES search over 小睿资讯服务
query = {"bool": {"should": [],
"must": []
}
}
must_query = [
{"match_phrase": {"tableName": "小睿资讯服务"}},
{"match_phrase": {"bookCategory": input['bookCategory']}}
]
should_query = []
for k, v in input.items():
if not v:
continue
if input['bookCategory']=="K12" and k in ["schoolStage", "subjectCategory", "studentGrade"] and v:
v = v.strip().split(";")
v = list(set(v))
for v0 in v:
temp = {"match_phrase": {"message": v0}}
must_query.append(temp)
else:
# temp = {"match": {"message": v}}
temp = {"match": {"resourceName": v}}
should_query.append(temp)
query['bool']['should'] = should_query
query['bool']['must'] = must_query
return query
def search_evaluation_database(self, input):
        # 5. ES search over 测评库资源
query = {"bool": {"should": [],
"must": []
}
}
must_query = [
{"match_phrase": {"tableName": "测评库资源"}},
{"match_phrase": {"bookCategory": input['bookCategory']}}
]
should_query = []
for k, v in input.items():
if not v:
continue
if input['bookCategory']=="K12" and k in ["schoolStage", "subjectCategory", "studentGrade"] and v:
v = v.strip().split(";")
v = list(set(v))
for v0 in v:
if not v0:
continue
temp = {"match_phrase": {"message": v0}}
must_query.append(temp)
else:
# temp = {"match": {"message": v}}
temp = {"match": {"resourceName": v}}
should_query.append(temp)
query['bool']['should'] = should_query
query['bool']['must'] = must_query
return query
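    # For reference, each builder above yields a bool query of this shape (an
    # illustrative example for bookCategory="K12", subjectCategory="语文",
    # keyword="阅读"):
    # {"bool": {"must": [{"match_phrase": {"tableName": "测评库资源"}},
    #                    {"match_phrase": {"bookCategory": "K12"}},
    #                    {"match_phrase": {"message": "语文"}}],
    #           "should": [{"match": {"resourceName": "阅读"}}]}}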
def search_no_ai_source(self, input):
query_study_tool = self.search_study_tool(input)
query_third_party_own_resource = self.search_third_party_own_resource(input)
query_brand_resource = self.search_brand_resource(input)
query_xiaorui_information = self.search_xiaorui_information(input)
query_evaluation_database = self.search_evaluation_database(input)
        # K12 books
if input['bookCategory'] == "K12":
            # Run the five queries in parallel
with ThreadPoolExecutor(max_workers=5) as executor:
res1 = executor.submit(self.execute_query, query=query_study_tool, size=self.size_study_tool)
res2 = executor.submit(self.execute_query, query=query_third_party_own_resource, size=self.size_third_party_own_resource)
res3 = executor.submit(self.execute_query, query=query_brand_resource, size=self.size_brand_resource)
res4 = executor.submit(self.execute_query, query=query_xiaorui_information, size=self.size_xiaorui_information)
res5 = executor.submit(self.execute_query, query=query_evaluation_database, size=self.size_evaluation_database)
                # Wait for all tasks and collect the results
res1 = res1.result()
res2 = res2.result()
res3 = res3.result()
res4 = res4.result()
res5 = res5.result()
res = res1 + res2 + res3 + res4 + res5
        # Children's books and other books
else:
            # Run the four queries in parallel
with ThreadPoolExecutor(max_workers=4) as executor:
res2 = executor.submit(self.execute_query, query=query_third_party_own_resource, size=self.size_third_party_own_resource)
res3 = executor.submit(self.execute_query, query=query_brand_resource, size=self.size_brand_resource)
res4 = executor.submit(self.execute_query, query=query_xiaorui_information, size=self.size_xiaorui_information)
res5 = executor.submit(self.execute_query, query=query_evaluation_database, size=self.size_evaluation_database)
                # Wait for all tasks and collect the results
res2 = res2.result()
res3 = res3.result()
res4 = res4.result()
res5 = res5.result()
res = res2 + res3 + res4 + res5
return res
def search(self, input):
        # Search entry point called by the backend
if input['bookCategory'] not in ["K12", "童书"]:
input['bookCategory'] = "其他图书"
if "tableName" not in input.keys() or not input['tableName']:
res = self.search_no_ai_source(input)
elif "tableName" in input.keys() and input['tableName'] =="伴学工具":
query_study_tool = self.search_study_tool(input)
res = self.execute_query( query = query_study_tool, size = self.size_study_tool)
elif "tableName" in input.keys() and input['tableName'] == "第三方自有资源评级详表":
query_third_party_own_resource = self.search_third_party_own_resource(input)
res = self.execute_query(query=query_third_party_own_resource, size=self.size_third_party_own_resource)
elif "tableName" in input.keys() and input['tableName'] == "品牌资源(爱奇艺+慕课)":
query_brand_resource = self.search_brand_resource(input)
res = self.execute_query(query=query_brand_resource, size=self.size_brand_resource)
elif "tableName" in input.keys() and input['tableName'] == "小睿资讯服务":
query_xiaorui_information = self.search_xiaorui_information(input)
res = self.execute_query(query=query_xiaorui_information, size=self.size_xiaorui_information)
elif "tableName" in input.keys() and input['tableName'] == "测评库资源":
query_evaluation_database = self.search_evaluation_database(input)
res = self.execute_query(query=query_evaluation_database, size=self.size_evaluation_database)
else:
res = [{"error":"tableName must in 【伴学工具;第三方自有资源评级详表;品牌资源(爱奇艺+慕课);小睿资讯服务;测评库资源】 "}]
        # Drop empty fields (iterate over a copy so keys can be popped while looping)
        for item in res:
            for k, v in list(item.items()):
                if not v:
                    item.pop(k)
return res
if __name__ == "__main__":
index_name = "no_ai_source" # 非AI资源 数据库
es = EsHelper()
    # es.delete_index(index_name)  # delete the ES index
# print(es.create_index("test_index", {"number_of_shards": 1, "number_of_replicas": 0}))
# print(es.search("test_index", {"query": {"match_all": {}}}))
    # 1. Create a new ES index
    # if es.index_exists(index_name):
    #     es.delete_index(index_name)  # delete the old index
    # create_index_res = es.create_index(index_name)  # build a fresh ES index
    # print('create_index_res: ', create_index_res)
#
    # 2. Insert the data into ES
# file = '/home/liuxin/work/AI_planner/data/非AI工具/商务资源SOP管理表es.json'
# data_list = es.read_dict( file) # 18169
# es.insert_doc_batch( index_name, data_list)
# res = es.get_doc(index_name, 18128)
# print(res)
#
    # 3. Check how many documents are in ES
# count = es.es.count(index_name)
# print(f'{index_name} :{count} ') #
    ###### ES search over the non-AI resource data
es_search = ESsearch()
input = {
"bookCategory": "童书",
"subjectCategory": "语文",
"keyword": "查询方便、词库齐全"
}
hits = es_search.search(input)
# hits = es_search.search_no_ai_source(input)
hits = json.dumps(hits, ensure_ascii=False, indent=4)
print(hits)
print('\nfinished.')
#!/usr/bin/env python3
import asyncio
import base64
import argparse
from playwright.async_api import async_playwright
from fastapi.responses import JSONResponse
import time
import re, json
import uuid
import uvicorn
from fastapi import FastAPI, Query, HTTPException
from pydantic import BaseModel, HttpUrl
from typing import Optional
import requests
import os
import mimetypes
from typing import Dict, Optional, Union, Tuple
from urllib.parse import quote
# from screenshot import capture_screenshot
async def capture_screenshot(url, width=1280, height=800, save_path=None):
"""
Capture a screenshot of a webpage and return as base64 encoded string.
Args:
url (str): The URL to capture
width (int): Viewport width
height (int): Viewport height
Returns:
str: Base64 encoded screenshot data
"""
timestamp = time.time()
timestamp = str(timestamp)
timestamp = re.sub(r"\.", "_", timestamp)
async with async_playwright() as p:
browser = await p.chromium.launch()
page = await browser.new_page(viewport={'width': width, 'height': height})
try:
await page.goto(url, wait_until='networkidle')
except Exception as e:
await page.goto(url, wait_until='load')
screenshot_bytes = await page.screenshot(full_page=True, path=save_path)
await browser.close()
# Convert to base64
base64_screenshot = base64.b64encode(screenshot_bytes).decode('utf-8')
# base64_screenshot = screenshot_bytes
    # Save the image to disk (page.screenshot above already wrote it when save_path is set)
if save_path:
with open(save_path, 'wb') as f:
f.write(screenshot_bytes)
# print(f"Screenshot saved to {args.output}")
return base64_screenshot
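# A minimal standalone usage sketch for capture_screenshot (assumes the Playwright
# Chromium browser is installed, e.g. via `playwright install chromium`; the URL
# is illustrative):
# b64 = asyncio.run(capture_screenshot("https://example.com", save_path="shot.png"))
# print(len(b64), "base64 characters")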
app = FastAPI(title="Screenshot Service")
class Input(BaseModel):
url: str = ""
width: int = 1280
height: int = 800
class ScreenshotResponse(BaseModel):
url: str
base64_image: str
width: int
height: int
class OBSUploader:
def __init__(self, base_url: str = "https://open.raysgo.com", auth_token: Optional[str] = None):
"""
Initialize the OBS uploader.
Args:
base_url: The base URL for the API
auth_token: The authorization token for API access
"""
self.base_url = base_url.rstrip('/')
self.auth_token = auth_token
self.headers = {
'Authorization': f'Bearer {auth_token}' if auth_token else None
}
# Initialize mimetypes
mimetypes.init()
def _get_content_type(self, file_path: Union[str, bytes]) -> Tuple[str, bytes]:
"""
Get content type and file content from file path or bytes.
Args:
file_path: Path to the file or file content as bytes
Returns:
Tuple of (content_type, file_content)
"""
if isinstance(file_path, str):
if not os.path.exists(file_path):
raise FileNotFoundError(f"File not found: {file_path}")
content_type, _ = mimetypes.guess_type(file_path)
with open(file_path, 'rb') as f:
file_content = f.read()
else:
file_content = file_path
# For bytes input, try to detect type from first few bytes
content_type = 'application/octet-stream' # Default content type
return content_type or 'application/octet-stream', file_content
def get_upload_url(self, biz_code: str, object_name: str, content_type: str) -> Dict:
"""
Get a temporary upload URL for the specified object.
Args:
biz_code: Business code for the upload
object_name: Name/path of the object to upload
content_type: MIME type of the file
Returns:
Dict containing the upload URL and related information
"""
endpoint = f"{self.base_url}/aimodel/v1.0/obs/getCreatePostSignature"
params = {
'bizCode': biz_code,
'objectName': object_name,
'mimeType': content_type
}
response = requests.get(endpoint, params=params, headers=self.headers)
response.raise_for_status()
return response.json()
def upload_file(self, file_path: Union[str, bytes], biz_code: str, object_name: str) -> Dict:
"""
Upload a file using temporary credentials.
Args:
file_path: Path to the file to upload or file content as bytes
biz_code: Business code for the upload
object_name: Name/path of the object to upload
Returns:
Dict containing the upload result and file URL
"""
# Get content type and file content
content_type, file_content = self._get_content_type(file_path)
# Get temporary upload URL with content type
upload_info = self.get_upload_url(biz_code, object_name, content_type)
if upload_info['errCode'] != 0:
raise Exception(f"Failed to get upload URL: {upload_info['message']}")
upload_url = upload_info['data']['temporarySignatureUrl']
# Upload the file with the correct content type
headers = {
'Content-Type': content_type,
'Content-Length': str(len(file_content))
}
response = requests.put(upload_url, data=file_content, headers=headers)
response.raise_for_status()
return {
'success': True,
'file_url': upload_info['data']['domain'] + '/' + object_name,
'object_url_map': upload_info['data']['objectUrlMap']
}
@app.post("/screenshot/")
async def get_screenshot(input: Input):
save_images_path = "images"
if not os.path.exists(save_images_path):
        # Create the directory tree (parent directories handled automatically)
os.makedirs(save_images_path)
file_md5 = uuid.uuid4().hex
try:
base64_image = await capture_screenshot(input.url, width=input.width, height=input.height)
res = {}
        # Decode and save the base64 image
base64_image = base64.b64decode(base64_image)
save_file = f"screenshot_{file_md5}.jpg"
save_file = os.path.join(save_images_path, save_file)
with open(save_file, "wb") as f:
f.write(base64_image)
        uploader = OBSUploader(auth_token="dcg-4c1e3a7f4fcd415e8c93151ff539d20a")  # NOTE: hardcoded auth token
        # Upload the file so the screenshot can be viewed in a browser
try:
result = uploader.upload_file(
file_path=save_file,
biz_code="test",
object_name=f"screenshot/{uuid.uuid4().hex}.jpg"
)
print(f"File uploaded successfully! URL: {result['file_url']}")
res["obs_url"] = result["file_url"]
except Exception as e:
print(f"Upload failed: {str(e)}")
res["obs_url"] = ''
res = JSONResponse(status_code=200, content=res)
    except Exception as e:
        # res may not be defined yet if capture_screenshot itself failed
        res = JSONResponse(status_code=500, content={"message": str(e), "obs_url": ""})
return res
@app.get("/")
async def root():
return {
"service": "Screenshot Service",
"usage": "GET /screenshot?url=https://example.com&width=1280&height=800"
}
if __name__ == "__main__":
uvicorn.run(app, host="0.0.0.0", port=19801)
# Take a screenshot of a specified web page
# Deployment server: 116.63.110.220
# sudo docker run -itd --name playwright -p 19801:19801 -v /home/liuxin/work:/home/work playwright:v1.2 /bin/bash
# sudo docker exec -it playwright bash
# cd /home/work/AI_planner/screenshot
# nohup python -u screenshot_service.py > screenshot_service.log 2>&1 &  # start the service
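# An example request against the service above (port follows the uvicorn.run call;
# the URL is illustrative):
# curl -X POST http://localhost:19801/screenshot/ \
#      -H "Content-Type: application/json" \
#      -d '{"url": "https://example.com", "width": 1280, "height": 800}'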
import requests
import json
from urllib.parse import quote
import base64
from PIL import Image
import io
# API endpoint URL (replace with the actual service address)
# api_url = "http://116.63.110.220:19801/screenshot"
api_url = "http://localhost:19801/screenshot"
# Request parameters
url = "https://www.icourse163.org/course/WUST-1206144803?from=searchPage&outVendor=zw_mooc_pcssjg_"
url = "https://www.baidu.com/"
params = {
    "url": url,      # target URL to capture
    "width": 1290,   # viewport width
    "height": 700    # viewport height
}
try:
    # Send a POST request (the endpoint expects a JSON body)
    response = requests.post(api_url, json=params)
    # Check the response status code
if response.status_code == 200:
data = response.json()
obs_url = data['obs_url']
print("目标网站截屏保存地址:", obs_url)
else:
print("error")
except Exception as e:
print(f"发生异常: {str(e)}")
#!/usr/bin/env python3
import asyncio
import base64
import argparse
from playwright.async_api import async_playwright
from fastapi.responses import JSONResponse
import time
import re, json
import uvicorn
from fastapi import FastAPI, Query, HTTPException
from pydantic import BaseModel, HttpUrl
from typing import Optional
# from screenshot import capture_screenshot
async def capture_screenshot(url, width=1280, height=800, save_path=None):
"""
Capture a screenshot of a webpage and return as base64 encoded string.
Args:
url (str): The URL to capture
width (int): Viewport width
height (int): Viewport height
Returns:
str: Base64 encoded screenshot data
"""
timestamp = time.time()
timestamp = str(timestamp)
timestamp = re.sub(r"\.", "_", timestamp)
async with async_playwright() as p:
browser = await p.chromium.launch()
page = await browser.new_page(viewport={'width': width, 'height': height})
try:
await page.goto(url, wait_until='networkidle')
except Exception as e:
await page.goto(url, wait_until='load')
screenshot_bytes = await page.screenshot(full_page=True, path=save_path)
await browser.close()
# Convert to base64
base64_screenshot = base64.b64encode(screenshot_bytes).decode('utf-8')
# base64_screenshot = screenshot_bytes
    # Save the image to disk
if save_path:
with open(save_path, 'wb') as f:
f.write(screenshot_bytes)
# print(f"Screenshot saved to {args.output}")
return base64_screenshot
app = FastAPI(title="Screenshot Service")
class Input(BaseModel):
url: str = ""
width: int = 1280
height: int = 800
class ScreenshotResponse(BaseModel):
url: str
base64_image: str
width: int
height: int
@app.post("/screenshot/")
async def get_screenshot(input: Input):
try:
input = json.loads(input.json())
url = input['url']
width = input['width']
height = input['height']
base64_image = await capture_screenshot(url, width=width, height=height)
res = {"base64_image":base64_image, "url":url, "width":width, "height":height}
res = JSONResponse(status_code=200, content=res)
except Exception as e:
res = JSONResponse(status_code=500, content={"message": str(e)} )
return res
@app.get("/")
async def root():
return {
"service": "Screenshot Service",
"usage": "GET /screenshot?url=https://example.com&width=1280&height=800"
}
if __name__ == "__main__":
uvicorn.run(app, host="0.0.0.0", port=19801)
# Take a screenshot of a specified web page
# sudo docker run -itd --name playwright -p 19801:19801 -v /home/liuxin/work:/home/work playwright:v1.1 /bin/bash
# sudo docker exec -it playwright bash
# cd /home/work/AI_planner/screenshot
# python web_server.py  # start the service