import asyncio
import uvicorn
import threading
import json
import time
import sys
import os
import json
from fastapi import FastAPI, Request
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
from loguru import logger
from utils.common import get_millisecond_time,save_logs_to_file
import requests
import re
from pydantic import BaseModel
from config.config import DATA_LOGS_DATA,CHECK_IMAGE_CAPTION,CHECK_IMAGE_CAPTION_VLM,CHECK_IMAGE_CONTEXT,CHECK_IMAGE_CONTEXT_VLM
from config.config import VLM_Match_User_Prompt,VLM_Match_Context_User_Prompt
from datetime import datetime
from tasks.qwen_vl_infer import qwen_vl_infer

# Remove the handler loguru installs by default before registering the file sink.
logger.remove()

# File sink options: the file name carries the date and a new file is started every day.
_LOG_SINK_OPTIONS = {
    "sink": "/nfs/liuxin/work/Image_TextTitle_Matching/Image_Text_Matching_Server_Pro/logs/image_text_match_{time:YYYY-MM-DD}.log",
    "format": "{time:YYYY-MM-DD HH:mm:ss} {process} {level} {module}:{function}:{line}: {message}",
    "rotation": "1 day",      # start a new file each day
    "encoding": "utf-8",      # log file encoding
    "enqueue": True,          # hand records to a queue so writes happen asynchronously
    "retention": "90 days",   # keep the last 90 days of log files
}
logger.add(**_LOG_SINK_OPTIONS)

# FastAPI application instance; the middleware and routes below are registered on it.
app = FastAPI()

# Request body accepted by the image/text matching endpoints in this module.
class Item(BaseModel):
    # Location of the illustration; required. The endpoints download it and/or
    # pass it to qwen_vl_infer.
    illustration_url:str
    # Caption (title) of the illustration; empty string when not supplied.
    caption_text:str = ""
    # Body text surrounding the illustration; empty string when not supplied.
    context_info:str = ""
    

# CORS policy: every origin, method and header is accepted, with credentials enabled.
# NOTE(review): wildcard origins combined with allow_credentials=True is very
# permissive -- confirm this is intended for the deployment.
_CORS_OPTIONS = {
    "allow_origins": ["*"],
    "allow_credentials": True,
    "allow_methods": ["*"],
    "allow_headers": ["*"],
}
app.add_middleware(CORSMiddleware, **_CORS_OPTIONS)


@app.get("/")
def read_root():
    # Landing route: logs the access and answers with a fixed greeting payload.
    greeting = {"Hello": "World"}
    logger.info("Root endpoint was accessed")
    return greeting

@app.get('/health')
def health():
    # Health probe: always answers with the constant string "OK".
    status = "OK"
    return status


## Image/text matching endpoint; the layout-detection service is not called here.

# System prompt sent with every VLM matching request.
_VLM_SYSTEM_PROMPT = '你是一个图文匹配判断专家。'
# Text substituted for the caption when only the body text is checked.
_NO_CAPTION_HINT = '用户没有提供图像标题，仅需关注正文。'
# Verdict string that marks a mismatch in the VLM reply.
_MISMATCH_VERDICT = '不匹配'
# Extensions kept when archiving the downloaded image; anything else is stored as .jpg.
_ALLOWED_IMAGE_SUFFIXES = ('jpg', 'png', 'jpeg')
# (connect, read) timeout in seconds for the image download.
_DOWNLOAD_TIMEOUT = (5, 60)


def _split_image_name(url):
    """Return ``(stem, suffix)`` for the last path segment of *url*.

    The query string and fragment are ignored, only the last dot separates the
    suffix (so names containing several dots are accepted), and a missing or
    non-whitelisted suffix falls back to ``'jpg'``.
    """
    from urllib.parse import urlsplit

    last_segment = urlsplit(url).path.rsplit('/', 1)[-1]
    stem, dot, suffix = last_segment.rpartition('.')
    if not dot:
        stem, suffix = last_segment, ''
    suffix = suffix.lower()
    if suffix not in _ALLOWED_IMAGE_SUFFIXES:
        suffix = 'jpg'
    return stem, suffix


def _parse_vlm_verdict(raw_response):
    """Return ``(verdict, reason)`` parsed from a VLM reply, or ``None``.

    A reply is parseable only when it is a string that contains '匹配结果：'
    followed by '原因：'. Curly braces and surrounding whitespace are removed
    from both parts.
    """
    if not isinstance(raw_response, str):
        return None
    _, found, tail = raw_response.partition('匹配结果：')
    if not found:
        return None
    verdict, found, reason = tail.partition('原因：')
    if not found:
        return None

    def _clean(text):
        return text.replace('{', '').replace('}', '').strip()

    return _clean(verdict), _clean(reason)


def _run_vlm_check(item, user_prompt, error_type, reported_text, log_tag):
    """Ask the VLM whether the image matches *user_prompt*.

    Returns ``(raw_response, error)`` where *error* is the result entry to
    report when the verdict is a mismatch, otherwise ``None``.
    """
    raw_response = qwen_vl_infer(item.illustration_url, _VLM_SYSTEM_PROMPT, user_prompt)
    error = None
    verdict = _parse_vlm_verdict(raw_response)
    if verdict is not None and verdict[0] == _MISMATCH_VERDICT:
        # 'error_reson' (sic) is the key existing clients receive; keep it.
        error = {'context_text': reported_text, 'error_reson': verdict[1], 'error_type': error_type}
    logger.info(f'{log_tag}qwen_match_response={raw_response}\n\timage_url={item.illustration_url}\n\t title_text={item.caption_text}\n\t context_info={item.context_info}')
    return raw_response, error


def _find_local_context(caption_text, context_info):
    """Collect the parts of *context_info* that refer to the captioned figure.

    Two pieces are gathered, each followed by a newline:
    the line fragment starting at the figure number found in the caption
    (e.g. 图5 or 图5-7), and the sentence (terminated by '。') that contains
    the caption verbatim. Returns an empty string when neither is found.
    """
    # Figure number such as "图 K", "图K-1", "图K-1-2"; spaces are removed.
    image_id = ''
    id_match = re.search(r"图\s*\d+(?:-\d+)*", caption_text)
    if id_match:
        image_id = id_match.group(0).replace(" ", "")
    logger.info(f'{caption_text} image_order is {image_id}')

    paragraph_info = ''
    # Only search by figure number when one was found: an empty id would
    # match at position 0 and pull in the first line of the body text.
    if image_id and image_id in context_info:
        id_context = re.search(rf"{re.escape(image_id)}[^\n]*", context_info)
        if id_context:
            paragraph_info += id_context.group(0) + '\n'
        logger.info(f'{context_info} image_order context is {paragraph_info}')

    sentence = re.search(rf'[^。]*{re.escape(caption_text)}[^。]*。', context_info)
    if sentence:
        paragraph_info += sentence.group(0) + '\n'
    return paragraph_info


@app.post("/v1/image_text_matching")
def formula_parse_by_tex(item:Item):
    """Check whether an illustration matches its caption and its body text.

    The image at ``item.illustration_url`` is downloaded and archived under
    ``DATA_LOGS_DATA/<date>/images``. Depending on the ``CHECK_IMAGE_*``
    config flags, the VLM is asked whether the image matches the caption and
    the (local) body text. A JSON log record is written under
    ``DATA_LOGS_DATA/<date>/jsons``.

    Args:
        item: Request body with the image URL, caption and body text.

    Returns:
        A ``JSONResponse`` whose body has ``code`` (200, 400 or 500),
        ``message`` and ``result`` -- a list of mismatch entries with keys
        ``context_text``, ``error_reson`` and ``error_type`` ('caption' or
        'context'). Successful responses also carry ``model``.
    """
    logger.info('in image_text_matching')

    # Created before the try block so the error handler can always reuse it.
    content = {"result": []}
    try:
        # Data-flywheel record persisted next to the archived image.
        logs_info = {
            'url': '',
            'image_path': '',
            'caption_text': '',
            'context_info': '',
            'match_context_info': '',
            'caption_check': '',
            'context_check': '',
        }

        url = (item.illustration_url or '').strip()
        if not url:
            logger.error("requests url is empty")
            content['code'] = 400
            content['message'] = 'the url is empty!'
            return JSONResponse(content=content)

        if not item.caption_text and not item.context_info:
            logger.error("caption and context are both empty")
            content['code'] = 400
            content['message'] = 'caption and context all Empty:!'
            return JSONResponse(content=content)

        # Archive locations: <DATA_LOGS_DATA>/<date>/{jsons,images}.
        task_id = get_millisecond_time()
        file_name, suffix = _split_image_name(url)
        current_date = datetime.now().strftime("%Y-%m-%d")
        daily_log_dir = os.path.join(DATA_LOGS_DATA, current_date)
        json_save_dir = os.path.join(daily_log_dir, 'jsons')
        image_save_dir = os.path.join(daily_log_dir, 'images')
        os.makedirs(json_save_dir, exist_ok=True)
        os.makedirs(image_save_dir, exist_ok=True)

        # The task id is appended so repeated URLs do not overwrite each other.
        file_name = f'{file_name}_{task_id}'
        save_json_path = os.path.join(json_save_dir, file_name + '.json')
        save_file_path = os.path.join(image_save_dir, file_name + '.' + suffix)

        # Download the image, bypassing any configured proxy. The timeout keeps
        # a stalled server from blocking the worker, and the context manager
        # releases the connection on every exit path.
        start = time.time()
        proxies = {"http": None, "https": None}
        with requests.get(url, proxies=proxies, stream=True, timeout=_DOWNLOAD_TIMEOUT) as response:
            if response.status_code != 200:
                logger.error("下载文件失败: {0}", url)
                content['code'] = 400
                content['message'] = f"下载文件失败: {url}"
                content["result"] = []
                return JSONResponse(content=content)

            with open(save_file_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=10240):
                    if chunk:
                        f.write(chunk)
        logger.info(f'图像存储成功，path={save_file_path}')

        cost = time.time() - start
        logger.info("下载文件：\t{0}\t时间为：{1}秒", url, cost)

        logs_info['url'] = item.illustration_url
        logs_info['caption_text'] = item.caption_text
        logs_info['context_info'] = item.context_info
        logs_info['image_path'] = save_file_path

        ### 1. Image vs. caption
        if CHECK_IMAGE_CAPTION and len(item.caption_text) > 3:
            if CHECK_IMAGE_CAPTION_VLM:
                caption_prompt = VLM_Match_User_Prompt.replace("{{user_text}}", item.caption_text)
                raw_response, error = _run_vlm_check(
                    item, caption_prompt, 'caption', item.caption_text, 'in caption check=')
                logs_info['caption_check'] = raw_response
                if error is not None:
                    content['result'].append(error)
            else:
                # Placeholder for a private-model approach (e.g. ViT / BLIP).
                pass

        # Caption and body text are identical: nothing more to compare.
        # NOTE(review): this early return skips save_logs_to_file -- confirm
        # that dropping the log record here is intended.
        if item.context_info == item.caption_text:
            content['code'] = 200
            content['message'] = 'OK'
            content['model'] = 'qwen'
            return JSONResponse(content=content)

        ### 2. Image vs. body text
        if CHECK_IMAGE_CONTEXT and len(item.context_info) > 5 and len(item.caption_text) > 3:
            # Caption available: check only the body text that refers to the figure.
            paragraph_info = _find_local_context(item.caption_text, item.context_info)
            if len(paragraph_info) > 5:
                logs_info['match_context_info'] = paragraph_info
                if CHECK_IMAGE_CONTEXT_VLM:
                    context_prompt = VLM_Match_Context_User_Prompt.replace(
                        "{{user_text}}", paragraph_info).replace("{{caption}}", item.caption_text)
                    raw_response, error = _run_vlm_check(
                        item, context_prompt, 'context', item.context_info,
                        'in_context_check,have caption,=')
                    logs_info['context_check'] = raw_response
                    if error is not None:
                        content['result'].append(error)
        elif CHECK_IMAGE_CONTEXT and item.context_info.strip():
            # No usable caption: check the whole body text. Skipped when the
            # body text is empty, since there is nothing to compare against.
            if CHECK_IMAGE_CONTEXT_VLM:
                logger.info('context_check')
                context_prompt = VLM_Match_Context_User_Prompt.replace(
                    "{{user_text}}", item.context_info).replace("{{caption}}", _NO_CAPTION_HINT)
                raw_response, error = _run_vlm_check(
                    item, context_prompt, 'context', item.context_info,
                    'in_context_check，no_caption,=')
                logs_info['context_check'] = raw_response
                if error is not None:
                    content['result'].append(error)

        # Persist the data-flywheel record.
        save_logs_to_file(logs_info, file_name=save_json_path)

        content['code'] = 200
        content['message'] = 'OK'
        content['model'] = 'qwen'
        return JSONResponse(content=content)

    except Exception:
        # Top-level boundary: record the traceback instead of swallowing it.
        logger.exception('image_text_matching failed')
        content['code'] = 500
        content['message'] = 'Server Exception'
        content['result'] = []
        return JSONResponse(content=content)

## Caption matching for local files; the layout-detection service is not called here.
@app.post("/v1/image_text_matching_local_file")
def formula_parse_by_tex(item:Item):
    """Check whether an illustration matches its caption, without downloading it.

    ``item.illustration_url`` is passed straight to ``qwen_vl_infer``; the
    image is neither downloaded nor archived, and no log record is written.
    Only the caption check is performed here; body text is not checked.

    Args:
        item: Request body with the image location, caption and body text.

    Returns:
        A ``JSONResponse`` whose body has ``code`` (200, 400 or 500),
        ``message`` and ``result`` -- a list of mismatch entries with keys
        ``context_text``, ``error_reson`` and ``error_type`` ('caption').
        Successful responses also carry ``model``.
    """
    logger.info('in image_text_matching_local_file')

    # Created before the try block so the error handler can always reuse it.
    content = {"result": []}
    try:
        if not (item.illustration_url or '').strip():
            logger.error("requests url is empty")
            content['code'] = 400
            content['message'] = 'the url is empty!'
            return JSONResponse(content=content)

        if not item.caption_text and not item.context_info:
            logger.error("caption and context are both empty")
            content['code'] = 400
            content['message'] = 'caption and context all Empty:!'
            return JSONResponse(content=content)

        ### Image vs. caption (skipped when no caption was supplied)
        if CHECK_IMAGE_CAPTION and item.caption_text.strip():
            if CHECK_IMAGE_CAPTION_VLM:
                qwen_match_response = qwen_vl_infer(
                    item.illustration_url,
                    '你是一个图文匹配判断专家。',
                    VLM_Match_User_Prompt.replace("{{user_text}}", item.caption_text),
                )
                logger.info(f'qwen_match_response={qwen_match_response}')

                # Expected reply shape: "匹配结果：<verdict> 原因：<reason>".
                # Anything else (including a non-string reply) is ignored.
                if isinstance(qwen_match_response, str):
                    _, has_verdict, tail = qwen_match_response.partition('匹配结果：')
                    verdict, has_reason, reason = tail.partition('原因：')
                    if has_verdict and has_reason:
                        # Braces are stripped so "{不匹配}" is recognised, as in
                        # the /v1/image_text_matching endpoint.
                        is_match = verdict.replace('{', '').replace('}', '').strip()
                        error_reson = reason.replace('{', '').replace('}', '').strip()
                        if is_match == '不匹配':
                            # 'error_reson' (sic) is the key clients receive; keep it.
                            content['result'].append({
                                'context_text': item.caption_text,
                                'error_reson': error_reson,
                                'error_type': 'caption',
                            })
            else:
                # Placeholder for a private-model approach (e.g. ViT / BLIP).
                pass

        content['code'] = 200
        content['message'] = 'OK'
        content['model'] = 'qwen'
        return JSONResponse(content=content)

    except Exception:
        # Top-level boundary: record the traceback instead of swallowing it.
        logger.exception('image_text_matching_local_file failed')
        content['code'] = 500
        content['message'] = 'Server Exception'
        content['result'] = []
        return JSONResponse(content=content)

if __name__ == "__main__":
    logger.info("image text matching check start successful!")

    # Backend endpoint served from here: http://61.170.32.13:29500/v1/image_text_matching
    uvicorn.run(
        app="main:app",
        host="0.0.0.0",
        port=29500,
        reload=False,
        workers=10,
    )
    # uvicorn.run(app=f'main:app', host="0.0.0.0", port=29501, reload=False, workers=3)  # 个人测试测试

    # cd /nfs/liuxin/work/Image_TextTitle_Matching
    # conda activate text_check
    # nohup python -u main.py > main_image_title.log 2>&1 &
    # tail -f main_image_title.log
    # ss -ntlp | grep 29500

