No description provided.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

Business_cards.py 45KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175
# --- Module setup: imports, ML model loading, and Flask app creation ---
# NOTE(review): several imports are duplicated further down (os, glob, requests,
# time, multiprocessing, PIL.Image, functools.partial, pandas, flask) — harmless
# at runtime but should be consolidated into one grouped import block.
from flask import Flask, render_template, request, redirect, Response, send_file
import os
# import openai
import requests
import pandas as pd
import pgeocode
from email_scraper import scrape_emails
import phonenumbers
from pdfminer.high_level import extract_text
import pytesseract
import time
import multiprocessing
from PIL import Image
from functools import partial
from urlextract import URLExtract
import pytesseract as tess
from PIL import Image
# from doctr.io import DocumentFile
# from doctr.models import ocr_predictor
# model = ocr_predictor(pretrained=True)
# load tagger
######################################################
import os
import glob
from pytesseract import *
import shutil
import cv2
import matplotlib
from werkzeug.utils import secure_filename
import requests
import spacy
import time
import multiprocessing
from PIL import Image
from functools import partial
# spaCy NER models for the resume parser.
# NOTE(review): absolute Windows paths — presumably deployment-specific; should
# be made configurable (env var / config file). TODO confirm.
nlp_model = spacy.load("D:/projects/C01app/Resume_parser/ME")
nlp_model1 = spacy.load("D:/projects/C01app/Resume_parser/bdeeducation_50_0.2")
from flask import Flask, render_template, request, redirect, Response, send_file
import pandas as pd
################################################################
# Normalize the CWD to forward slashes so it can be concatenated into model paths.
Current_Working_Directory=os.getcwd()
Current_Working_Directory=Current_Working_Directory.replace("\\","/")
# NOTE(review): this REBINDS nlp_model1 — the Resume_parser education model
# loaded a few lines above is discarded and replaced by the invoice-parser
# model. Looks like an accidental name collision; confirm which model any
# resume-parsing code later in the file actually expects.
nlp_model1 = spacy.load(Current_Working_Directory + "/Invoice_parser/p")
################################################################
# import spacy
# nlp_model1 = spacy.load('./ADD3001.2')
from flair.data import Sentence
from flair.models import SequenceTagger
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
# HuggingFace token-classification model used for city/country NER (address extraction).
tokenizer = AutoTokenizer.from_pretrained("ml6team/bert-base-uncased-city-country-ner")
model = AutoModelForTokenClassification.from_pretrained("ml6team/bert-base-uncased-city-country-ner")
from paddleocr import PaddleOCR, draw_ocr
# PaddleOCR instance used to OCR business-card images (English, angle classification on).
ocr = PaddleOCR(use_angle_cls=True, lang='en', use_space_char=True, show_log=False)
# Flair large English NER tagger — provides PER/ORG/LOC tags for card text.
# NOTE(review): loading this at import time is slow and memory-heavy; consider lazy init.
tagger = SequenceTagger.load("flair/ner-english-large")
import datetime
app = Flask(__name__)
# app.config["IMAGE_UPLOADS"] = "C:/inetpub/wwwroot/FlaskApplication/Flask_Demo/upload/"
  57. # app.config["IMAGE_UPLOADS"] = "C:/inetpub/wwwroot/FlaskApplication/Flask_Demo/upload/"
  58. @app.route('/', methods=['GET'])
  59. def home():
  60. return render_template('home.html')
  61. @app.route('/resume', methods=['GET'])
  62. def resume():
  63. return render_template('resume.html')
  64. @app.route('/invoice', methods=['GET'])
  65. def invoice():
  66. return render_template('invoice.html')
  67. @app.route('/card', methods=['GET'])
  68. def card():
  69. return render_template('card.html')
  70. @app.route('/upload_BusinessCards', methods=["POST"])
  71. # @app.route('/multiplecards', methods=["POST"])
  72. def multiplecards():
  73. # print('################## multiple card detection #######################')
  74. # print(Dataset)
  75. datalist=[]
  76. zlist=[]
  77. Dataset = request.get_json()
  78. # print(data)
  79. #datalist.append(Dataset)
  80. data = {'visiting': Dataset}
  81. for i in data['visiting']:
  82. import time
  83. # time.sleep(1)
  84. a = i
  85. x = a['FileData']
  86. # print(x)
  87. y = a['FileName']
  88. z = a['FileType']
  89. # CreatedBy=a['CreatedBy']
  90. name = y + '.' + z
  91. # print(name)
  92. # print(y)
  93. # image = y.split("/")
  94. # filename=image[-1]
  95. # print(x)
  96. img_data = x.encode()
  97. import base64
  98. with open('./multicards/' + name, "wb") as fh:
  99. fh.write(base64.decodebytes(img_data))
  100. # print(i)
  101. # import os
  102. # import glob
  103. # for i in glob.glob('./multipleupload/*'):
  104. found = './multicards/' + name
  105. print(found)
  106. extension = found.split('.')[-1]
  107. # for root, dirs, fils in os.glob('./multipleupload'):
  108. # for name in files:
  109. # foundfile= os.path.join(root, name)
  110. # print(foundfile)
  111. import re
  112. import csv
  113. import glob
  114. import os
  115. # import pytesseract
  116. # import cv2
  117. import numpy as np
  118. import glob
  119. import os
  120. import cv2
  121. import requests
  122. final = []
  123. # final.append('assignto--'+CreatedBy)
  124. imagelist = []
  125. # print(found)
  126. remove_list = []
  127. import os
  128. import glob
  129. import pdfminer
  130. # import os
  131. # ts = 0
  132. # for file_name in glob.glob('./upload/*'):
  133. # fts = os.path.getmtime(file_name)
  134. # if fts > ts:
  135. # ts = fts
  136. # found = file_name
  137. # print(found)
  138. # print(extension)
  139. def org_name():
  140. print('org_name is working')
  141. import pytesseract
  142. fname = found
  143. if extension != 'pdf':
  144. img = cv2.imread(fname)
  145. img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
  146. cv2.imwrite(str(found), img)
  147. from PIL import Image
  148. im = Image.open(found)
  149. im.save("images1.png", dpi=(1200, 1200))
  150. # import pytesseract
  151. fname = "images1.png"
  152. import pytesseract as tess
  153. from PIL import Image
  154. tess.pytesseract.tesseract_cmd = r"C:\\Program Files (x86)\\Tesseract-OCR\\tesseract.exe"
  155. pdf = tess.image_to_pdf_or_hocr(fname, extension="pdf")
  156. with open("demo.pdf", "w+b", ) as f:
  157. f.write(pdf)
  158. from pdfminer.high_level import extract_text
  159. text = extract_text('demo.pdf')
  160. # doc = DocumentFile.from_images(found)
  161. # result = model(doc)
  162. # text = result.render()
  163. # from pdfminer.high_level import extract_text
  164. # txt = extract_text('demo.pdf')
  165. else:
  166. from pdfminer.high_level import extract_text
  167. text = extract_text(fname)
  168. sentence = Sentence(text)
  169. # predict NER tags
  170. tagger.predict(sentence)
  171. # print sentence
  172. ko = (sentence)
  173. ko1 = str(ko).split("→")
  174. import pandas as pd
  175. dfg = []
  176. try:
  177. s = ko1[1].replace("", "").replace("", "").replace("/", ":")
  178. # os.remove(found)
  179. # return 'Invalid image'
  180. dfg.append(s)
  181. df = pd.DataFrame(dfg)
  182. df = df[0]
  183. df.to_csv("df.csv", index=False)
  184. df1 = pd.read_csv("df.csv")
  185. ve = df1["0"].str.split(",")
  186. fgf = ve.to_list()
  187. dfgh = pd.DataFrame(fgf[0])
  188. maindf = dfgh[0] # .str.split(":")
  189. # maindf.to_csv("main.csv")
  190. main1 = maindf.to_list()
  191. main1
  192. # cv=pd.DataFrame(ve)
  193. # cv
  194. per = ["PER"]
  195. org = ["ORG"]
  196. loc = ["LOC"]
  197. organizations = [i for i in main1 for j in org if j in i]
  198. PErsons = [i for i in main1 for j in per if j in i]
  199. location = [i for i in main1 for j in loc if j in i]
  200. except IndexError:
  201. pass
  202. # ************************************* ORGANIZATION ********************************************************************
  203. def organisation():
  204. print('organisation working ')
  205. try:
  206. if len(("OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace('[',
  207. '').replace(
  208. '.com', '').replace('.in', '') + " /" + organizations[1].replace(":ORG", "").replace('"',
  209. '').replace(
  210. '.com', ''))) < 4:
  211. pass
  212. else:
  213. match = str(urlfinal[0]).lower()
  214. match = match.replace('.com', '').replace('www.', '').replace('.in', '').replace('.co', '').replace(
  215. 'https',
  216. '').replace(
  217. 'http', '').replace(":", "").replace("/", "").upper()
  218. print(match)
  219. s1g = organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace('.com',
  220. '') + " /" + \
  221. organizations[1].replace(":ORG", "").replace('"', '').replace('.com', '')
  222. s1 = s1g.upper()
  223. s2 = match.upper()
  224. from difflib import SequenceMatcher
  225. print(s1)
  226. print(s2)
  227. print(SequenceMatcher(None, s1, s2).ratio())
  228. if SequenceMatcher(None, s1, s2).ratio() >= 0.10:
  229. # and SequenceMatcher(None, s1, s2).ratio()<0.50:
  230. final.append(
  231. "OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace('[',
  232. '').replace(
  233. '.com', '').replace(']', '') + " /" + organizations[1].replace(":ORG", "").replace('"',
  234. '').replace(
  235. '.com',
  236. '').replace(']', ''))
  237. else:
  238. final.append("OrganizationName--" + s2)
  239. except IndexError:
  240. try:
  241. if len(("OrganizationName--" + organizations[0].replace(":ORG", "").replace('[', '').replace(']',
  242. '').replace(
  243. '"',
  244. '').replace(
  245. '.com', '').replace('.in', ''))) < 4:
  246. pass
  247. else:
  248. match = str(urlfinal[0]).lower()
  249. match = match.replace('.com', '').replace('www.', '').replace('.in', '').replace('.co',
  250. '').replace(
  251. 'https', '').replace('http', '').replace(":", "").replace("/", "").upper()
  252. s1g = organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace('.com', '')
  253. s1 = s1g.upper()
  254. s2 = match.upper()
  255. from difflib import SequenceMatcher
  256. print(s1)
  257. print(s2)
  258. print(SequenceMatcher(None, s1, s2).ratio())
  259. if SequenceMatcher(None, s1, s2).ratio() >= 0.10:
  260. # and SequenceMatcher(None, s1, s2).ratio()<0.50:
  261. final.append(
  262. "OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace(
  263. '[',
  264. '').replace(
  265. ']', '').replace(
  266. '.com', ''))
  267. else:
  268. final.append("OrganizationName--" + s2)
  269. except IndexError:
  270. try:
  271. match = str(urlfinal[0]).lower()
  272. match = match.replace('.com', '').replace('www.', '').replace('.in', '').replace('.co',
  273. '').upper()
  274. final.append("OrganizationName--" + match)
  275. # remove_list.append(match)
  276. except IndexError:
  277. company()
  278. #################################################company Name########################################
  279. def company():
  280. print('company list working')
  281. import re
  282. new = []
  283. with open('test.txt', 'r+') as f:
  284. flag = False
  285. for line in f:
  286. line = line.upper()
  287. matches = re.findall(
  288. r'''\bENTERPRISE\b|\bTRADE\b|\bEMPIRE\b|\bSTORES\b|\bMACHINERY\b|\bINDUSTRIES\b|\bTECHNOLOGY\b|\bCOMPANY\b|\bDESIGNERS\b|\bPOLYMER\b|\bBELT\b|\bAGRO\b|\bPLASTIC\b|\bGROUP\b|\bTOOLS\b|\bENGG.\b|\bSOLUTION\b|\bCONSTRUCTION\b|\bPACK\b|\bELECT\b|\bSTEEL\b|\bIRON\b|\bDIES\b|\bMOULD\b|\bCORPORATION\b|\bSEEDS\b|\bPOWER\b|\bCONSULTANT\b|\bMFG.\b|\bPRINT\b|\bFOOD\b|\bSOLAR\b|\bINDUSTRY\b|\bLIMITED\b|\bPRIVATE\b|\bPVT\b|\bLTD\b|\bOUTSOURCING\b|\bCNC\b|\bMACHINERIES\b|\bSOLUTIONS\b|\bENGINEERS\b|\bWORKS\b|\bPRODUCTS\b|\bENTERPRISES\b|\bCOMPANIES\b|\bPOLYMERS\b|\bTRADING\b''',
  289. line)
  290. for i in matches:
  291. if i in line:
  292. flag = True
  293. if flag:
  294. o = "OrganizationName--" + line
  295. new.append(o)
  296. # if line.startswith('\n'):
  297. # flag = False
  298. try:
  299. a = new[0].replace('\n', '')
  300. final.append(a)
  301. except IndexError:
  302. final.append("OrganizationName--")
  303. # ************************************* CONTACT PERSON *******************************************************************
  304. def contactpersonname():
  305. print('contactpersonname working')
  306. try:
  307. final.append(
  308. "CONTACTPERSONNAME--" + PErsons[0].replace(":PER", "").replace("[", "").replace('"', '').replace(
  309. "]",
  310. "") + '/' +
  311. PErsons[
  312. 1].replace(":PER", "").replace('"', ''))
  313. except IndexError:
  314. try:
  315. final.append(
  316. "CONTACTPERSONNAME--" + PErsons[0].replace(":PER", "").replace("[", "").replace("]",
  317. "").replace(
  318. '"', ''))
  319. except IndexError:
  320. final.append("CONTACTPERSONNAME--")
  321. def image_to_text():
  322. # doc = DocumentFile.from_images(found)
  323. # result = model(doc)
  324. # image_to_text.txt = result.render()
  325. # tess.pytesseract.tesseract_cmd = r"C:\\Program Files (x86)\\Tesseract-OCR\\tesseract.exe"
  326. # img = Image.open(found)
  327. # text = tess.image_to_string(img)
  328. # image_to_text.txt = text
  329. # print(text)
  330. import cv2
  331. img_path = found
  332. img = cv2.imread(img_path)
  333. img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
  334. cv2.imwrite(str(found), img)
  335. result = ocr.ocr(img_path, cls=True)
  336. result = result[0]
  337. txts = [line[1][0] for line in result]
  338. image_to_text.txt = ""
  339. for i in txts:
  340. if len(i) < 4:
  341. continue
  342. # print(i+"\n")
  343. image_to_text.txt = image_to_text.txt + str(i) + "\n"
  344. # print(image_to_text.txt)
  345. def pdf_to_text():
  346. from pdfminer.high_level import extract_text
  347. pdf_to_text.txt = extract_text(found)
  348. # pdf_to_text.txt= text.replace('\n', ' ')
  349. extensionlist = ['JPEG', 'jpg', 'png', 'JPG', 'PNG', 'jpeg']
  350. if extension in extensionlist:
  351. print('image' + extension)
  352. image_to_text()
  353. x = image_to_text.txt
  354. else:
  355. print('pdf' + extension)
  356. pdf_to_text()
  357. x = pdf_to_text.txt
  358. verticaltext = x
  359. htext = x
  360. # print('------------------------------------------------')
  361. #print('############################################################# this is verticaltext #################################################################')
  362. print(verticaltext)
  363. htext = htext.replace('\n', ' ')
  364. # print('############################################################# this is htext #############################################################')
  365. #print(htext)
  366. y = x.replace('\n', ',')
  367. y = y.replace(' ', ' ')
  368. # y = y.replace(".", " .")
  369. horizontaltext = y
  370. # print('------------------------------------------------')
  371. #print('############################################################# this is horizontaltext #############################################################')
  372. #print(horizontaltext)
  373. textfile = open("test123456.txt", "w")
  374. a = textfile.write(verticaltext)
  375. textfile.close()
  376. textfile = open("vtext.txt", "w")
  377. a = textfile.write(horizontaltext)
  378. textfile.close()
  379. with open('test123456.txt', 'r') as f:
  380. with open('test.txt', 'w') as w:
  381. for line in f:
  382. if line.strip().replace('|', ''):
  383. w.write(line)
  384. ###########################ADDRESS##################################
  385. addrespinlst = []
  386. def splitaddress():
  387. import re
  388. textaddress = htext.replace('\n', ' ')
  389. # print(textaddress)
  390. address1 = (textaddress.partition(",")[0])
  391. words = address1.split()
  392. address1 = words[-1]
  393. addre = (htext.partition(",")[2])
  394. a = addre.replace('\n', ' ').replace('\x0c', '')
  395. addre = (a.partition(",")[2])
  396. matches = re.findall(
  397. r'(.*?)-\d{3} \d{3}|(.*?)\b-\d{6}\b|(.*?)\b\d{6}\b|(.*?)\b\d{3} \d{3}\b|\b(.*?)-\d{2}\b|(.*?)\b\d{3} \d{3}\b',
  398. a)
  399. for match in matches:
  400. address2 = match
  401. address2 = str(address2)
  402. address2 = address2.replace("'", "").replace("(", "").replace(")", "").replace(', ,', '').replace(' ',
  403. '')
  404. matches = re.findall(r'-\d{6}|\b\d{6}\b|\b\d{3} \d{3}\b|\b-\d{2}\b|-\d{3} \d{3}|\b\d{3} \d{3}\b', a)
  405. for address3 in matches:
  406. pass
  407. try:
  408. Address = address1 + "," + address2 + "," + address3
  409. final.append('ADDRESS--' + Address)
  410. addrespinlst.append(Address)
  411. except NameError:
  412. final.append('ADDRESS--')
  413. #print('############################################################ Addressmodelworking #############################################################')
  414. # doc = nlp_model1(textaddress)
  415. # addlist = []
  416. # for ent in doc.ents:
  417. # name = (f'{ent.label_.upper():{10}}--{ent.text}')
  418. # addlist.append(name)
  419. # try:
  420. # Address = addlist[0]
  421. # final.append(Address)
  422. # addrespinlst.append(Address)
  423. # remove_list.append(
  424. # str(Address).replace("[", "").replace("]", "").replace("\\n", "").replace("'", "").replace(
  425. # "ADDRESS--",
  426. # ""))
  427. # except IndexError:
  428. # final.append("ADDRESS--")
  429. pass
  430. ################################################## website#######################################################
  431. # import re
  432. # url = []
  433. # matches = re.findall(r'www.*', verticaltext)
  434. # for match in matches:
  435. # if (match.count('.')) == 1:
  436. # a_string1 = match.replace("www", "www.")
  437. # final.append("Urls--" + a_string1)
  438. # url.append(a_string1)
  439. # else:
  440. # final.append("Urls--" + match)
  441. # if len(url)==0:
  442. # from urlextract import URLExtract
  443. # extractor = URLExtract()
  444. # urls = extractor.find_urls(verticaltext)
  445. # try:
  446. # urllist = urls[0]
  447. # final.append("Urls--"+urllist)
  448. # url.append(urllist)
  449. # except IndexError:
  450. # final.append("Urls--")
  451. # for match in matches:
  452. # if (match.count('.')) == 1:
  453. # a_string1 = match.replace("www", "www.")
  454. # final.append("Urls--" + a_string1)
  455. # url.append(a_string1)
  456. # else:
  457. # final.append("Urls--" + match)
  458. # url.append(match)
  459. # remove_list.append(match)
  460. # else:
  461. # final.append("Urls--" )
  462. ################################################## website#######################################################
  463. import re
  464. # final=[]
  465. url = []
  466. urlfinal = []
  467. matches = re.findall(r'www.*', verticaltext)
  468. for match in matches:
  469. if (match.count('.')) == 1:
  470. a_string1 = match.replace("www", "www.")
  471. # final.append("Urls--" + a_string1)
  472. url.append(a_string1)
  473. else:
  474. url.append(match)
  475. if len(url) == 0:
  476. from urlextract import URLExtract
  477. extractor = URLExtract()
  478. urls = extractor.find_urls(verticaltext)
  479. try:
  480. urllist = urls[0]
  481. url.append(urllist)
  482. url.append(urllist)
  483. except IndexError:
  484. pass
  485. for match in matches:
  486. if (match.count('.')) == 1:
  487. a_string1 = match.replace("www", "www.")
  488. url.append(a_string1)
  489. # url.append(a_string1)
  490. else:
  491. url.append(match)
  492. url.append(match)
  493. else:
  494. pass
  495. try:
  496. test_string = url[0]
  497. test_list = ['com', 'www', 'in', 'co', "WWW", "COM", "CO", "IN"]
  498. res = [ele for ele in test_list if (ele in test_string)]
  499. if len(res) == 0:
  500. print('no match')
  501. final.append('urls--')
  502. else:
  503. print('matched')
  504. final.append('urls--' + url[0])
  505. urlfinal.append(url[0])
  506. except IndexError:
  507. final.append('urls--')
  508. print(
  509. '############################################################# url #############################################################')
  510. print(url)
  511. #######organisation and contact################
  512. # def company_url():
  513. # # print('--url--')
  514. # # print(url)
  515. # try:
  516. # match = str(url[0]).lower()
  517. # match =match.replace('.com','').replace('www.','').replace('.in','').replace('.co','').upper()
  518. # final.append("OrganizationName--" + match)
  519. # # remove_list.append(match)
  520. # except IndexError:
  521. # org_name()
  522. # organisation()
  523. # final.append("OrganizationName--")
  524. # make example sentence
  525. # print(horizontaltext)
  526. sentence = Sentence(verticaltext)
  527. # predict NER tags
  528. tagger.predict(sentence)
  529. # print sentence
  530. ko = (sentence)
  531. ko1 = str(ko).split("→")
  532. import pandas as pd
  533. dfg = []
  534. try:
  535. s = ko1[1].replace("", "").replace("", "").replace("/", ":")
  536. except IndexError:
  537. os.remove(found)
  538. return 'Invalid image'
  539. dfg.append(s)
  540. df = pd.DataFrame(dfg)
  541. df = df[0]
  542. df.to_csv("df.csv", index=False)
  543. df1 = pd.read_csv("df.csv")
  544. ve = df1["0"].str.split(",")
  545. fgf = ve.to_list()
  546. dfgh = pd.DataFrame(fgf[0])
  547. maindf = dfgh[0] # .str.split(":")
  548. # maindf.to_csv("main.csv")
  549. main1 = maindf.to_list()
  550. main1
  551. # cv=pd.DataFrame(ve)
  552. # cv
  553. per = ["PER"]
  554. org = ["ORG"]
  555. loc = ["LOC"]
  556. organizations = [i for i in main1 for j in org if j in i]
  557. PErsons = [i for i in main1 for j in per if j in i]
  558. location = [i for i in main1 for j in loc if j in i]
  559. # ************************************* ORGANIZATION ********************************************************************
  560. try:
  561. if len(("OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace('[',
  562. '').replace(
  563. ']', '').replace(
  564. '.com', '') + " /" + organizations[1].replace(":ORG", "").replace('"', '').replace('.com', ''))) < 4:
  565. pass
  566. # company_url()
  567. else:
  568. match = str(urlfinal[0]).lower()
  569. match = match.replace('.com', '').replace('www.', '').replace('.in', '').replace('.co', '').replace(
  570. 'https',
  571. '').replace(
  572. 'http', '').replace(":", "").replace("/", "").upper()
  573. print(match)
  574. s1g = organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace(']', '').replace(
  575. '.com', '') + " /" + \
  576. organizations[1].replace(":ORG", "").replace('"', '').replace('.com', '')
  577. s1 = s1g.upper()
  578. s2 = match.upper()
  579. from difflib import SequenceMatcher
  580. print(s1)
  581. print(s2)
  582. print(SequenceMatcher(None, s1, s2).ratio())
  583. if SequenceMatcher(None, s1, s2).ratio() >= 0.10:
  584. # and SequenceMatcher(None, s1, s2).ratio()<0.50:
  585. final.append(
  586. "OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace('[',
  587. '').replace(
  588. '.com', '').replace(']', '') + " /" + organizations[1].replace(":ORG", "").replace('"',
  589. '').replace(
  590. '.com', '').replace(']', ''))
  591. else:
  592. final.append("OrganizationName--" + s2)
  593. except IndexError:
  594. try:
  595. if len(("OrganizationName--" + organizations[0].replace(":ORG", "").replace('[', '').replace(']',
  596. '').replace(
  597. '"',
  598. '').replace(
  599. '.com', ''))) < 4:
  600. pass
  601. # company_url()
  602. else:
  603. match = str(urlfinal[0]).lower()
  604. match = match.replace('.com', '').replace('www.', '').replace('.in', '').replace('.co', '').replace(
  605. 'https', '').replace('http', '').replace(":", "").replace("/", "").upper()
  606. s1g = organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace(']',
  607. '').replace(
  608. '.com', '')
  609. s1 = s1g.upper()
  610. s2 = match.upper()
  611. from difflib import SequenceMatcher
  612. print(s1)
  613. print(s2)
  614. print(SequenceMatcher(None, s1, s2).ratio())
  615. if SequenceMatcher(None, s1, s2).ratio() >= 0.10:
  616. # and SequenceMatcher(None, s1, s2).ratio()<0.50:
  617. final.append(
  618. "OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace('[',
  619. '').replace(
  620. ']', '').replace(
  621. '.com', '').replace(']', ''))
  622. else:
  623. final.append("OrganizationName--" + s2)
  624. except IndexError:
  625. org_name()
  626. organisation()
  627. # final.append("OrganizationName--")
  628. # ************************************* CONTACT PERSON *******************************************************************
  629. try:
  630. final.append(
  631. "CONTACTPERSONNAME--" + PErsons[0].replace(":PER", "").replace("[", "").replace('"', '').replace("]",
  632. "") +
  633. PErsons[
  634. 1].replace(":PER", "").replace('"', ''))
  635. except IndexError:
  636. try:
  637. final.append(
  638. "CONTACTPERSONNAME--" + PErsons[0].replace(":PER", "").replace("[", "").replace("]", "").replace(
  639. '"',
  640. ''))
  641. except IndexError:
  642. org_name()
  643. contactpersonname()
  644. # final.append("CONTACTPERSONNAME--")
  645. ###############address flair#####################
  646. try:
  647. print(
  648. '############################################################# address new code #############################################################')
  649. loactionlst = ['address', 'factory', 'd.no', 'h.no', 'h. no', 'plot', 'flat', 'plat']
  650. loclst = [i for i in loactionlst if i in htext.lower()]
  651. textaddress = htext
  652. textaddress = textaddress.replace("|", ",")
  653. textaddress = textaddress.lower()
  654. nlp = pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy="simple")
  655. grop = nlp(textaddress)
  656. citycountry = []
  657. print('########################### city or country name ###########################')
  658. d = grop[-1]
  659. if d['entity_group'] == "COUNTRY":
  660. print(d["word"])
  661. citycountry.append(d["word"])
  662. elif d['entity_group'] == "CITY":
  663. print(d["word"])
  664. citycountry.append(d["word"])
  665. try:
  666. address1 = loclst[0]
  667. except IndexError:
  668. address1 = (textaddress.partition(",")[0])
  669. words = address1.split()
  670. address1 = words[-1]
  671. star_location = address1.lower()
  672. end_location = citycountry[0].replace("#", "")
  673. start = star_location
  674. end = end_location
  675. s = textaddress.lower()
  676. middle_address = (s.split(start))[-1].split(end)[0]
  677. Address = start + middle_address + end
  678. Address = Address.replace('--', '').title()
  679. print(Address)
  680. if Address.count(',') < 2:
  681. splitaddress()
  682. else:
  683. final.append('ADDRESS--' + Address)
  684. # star_location = location[0].replace(":LOC", "").replace('"', '').replace('[', '')
  685. # end_location = location[-1].replace(":LOC", "").replace('"', '').replace(']', '')
  686. # d1 = star_location.split()
  687. # d2 = end_location.split()
  688. # d3 = d1[0]
  689. # d4 = d2[0]
  690. # start = d3
  691. # end = d4
  692. # s = horizontaltext
  693. # middle_address = ((s.split(start))[1].split(end)[0])
  694. # Address = d3 + middle_address + d4
  695. # final.append('ADDRESS--' + Address)
  696. # addrespinlst.append(Address)
  697. except IndexError:
  698. splitaddress()
  699. ########################################## Designation ###########################################
  700. import re
  701. new = []
  702. with open('test.txt', 'r') as f:
  703. flag = False
  704. for line in f:
  705. line1 = line
  706. line = line.upper()
  707. matches = re.findall(
  708. r'''\bAPPRENTICE\b|\bEXECUTIVE\b|\bPROPRIETOR\b|\bPARTNER\b|\bMD\b|\bANALYST\b|\bPRACTITIONER\b|\bCUSTOMER\b|\bCOO\b|\bCOACH\b|\bADMINISTRATIVE\b|\bADMINISTRATOR\b|\bAGENT\b|\bHEAD\b|\bCHIEF\b|\bDIRECTOR\b|\bVICE\b|\bPRESIDENT\b|\bMANAGER\b|\bCOORDINATOR\b|\bCOUNSELOR\b|\bSUPERVISOR\b|\bASSISTANT\b|\bSPECIALIST\b|\bARTIST\b|\bWORKER\b|\bCONSULTANT\b|\bREPRESENTATIVE\b|\bARCHITECT\b|\bSTAFF\b|\bMEMBER\b|\bDEVELOPER\b|\bENGINEER\b|\bEXAMINOR\b|\bDOCTOR\b|\bPROFESSOR\b|\bTEACHER\b|\bLEAD\b|\bOFFICER\b|\bCEO\b|\bC.E.O\b|\bJUNIOR\b|\bSENIOR\b|\bPROFESSOR\b|\bSALES\b''',
  709. line)
  710. for match in matches:
  711. line = line.replace('-', '')
  712. # print(line)
  713. o = "Designation--" + line
  714. new.append(o)
  715. remove_list.append(str(line1).replace('\n', ''))
  716. try:
  717. a = new[0].replace('\n', '')
  718. final.append(a)
  719. except IndexError:
  720. final.append("Designation--")
  721. ###################################################Phone number#################################################
  722. num = []
  723. import phonenumbers
  724. # print(verticaltext)
  725. numbers = phonenumbers.PhoneNumberMatcher(
  726. verticaltext.replace('+91', '').replace('(0)', '').replace('(', '').replace(')', ''), "IN")
  727. for number in numbers:
  728. number = str(number).split(")")
  729. num.append(number[1])
  730. # num.append(number[-1])
  731. print(num)
  732. import re
  733. # Input list of strings
  734. # num =[' 7227906777Extn1204634444']
  735. # Define a regular expression pattern to split when text is present
  736. pattern = r'[a-zA-Z]+'
  737. # Function to split a string based on the pattern
  738. def split_string(text):
  739. return re.split(pattern, text)
# Process each line in the list
split_lines = [split_string(line) for line in num]
# Flatten the list of lists into a single list
split_lines = [item for sublist in split_lines for item in sublist]
# Remove any empty strings
num = [item for item in split_lines if item]
# Print the split lines
print(num)
# First number found -> ContactNumber, last -> OrganizationNumber; blank
# values are appended when fewer than two numbers were detected so the
# downstream Keys/Values table always has both rows.
if len(num) == 0:
final.append("ContactNumber--")
final.append("OrganizationNumber--")
elif len(num) > 1:
final.append("ContactNumber--" + num[0].replace(' ', ''))
final.append("OrganizationNumber--" + num[-1].replace(' ', ''))
elif len(num) == 1:
# NOTE(review): the try/except is redundant here -- len(num) == 1
# guarantees num[0] exists -- kept byte-identical.
try:
final.append("ContactNumber--" + num[0].replace(' ', ''))
final.append("OrganizationNumber--")
except IndexError:
final.append("ContactNumber--")
final.append("OrganizationNumber--")
print(
'############################################################# num #############################################################')
print(num)
# try:
# final.append("PhoneNumber--" + num[0].replace(' ', ''))
# remove_list.append(num[0])
# except IndexError:
# pass
# try:
# final.append("PhoneNumber1--" + num[1].replace(' ', ''))
# remove_list.append(num[1])
# except IndexError:
# pass
# try:
# final.append("PhoneNumber2--" + num[2].replace(' ', ''))
# remove_list.append(num[2])
# except IndexError:
# pass
  779. ################################################### Email######################################################
  780. import re
  781. from email_scraper import scrape_emails
  782. s = list(scrape_emails(horizontaltext))
  783. email_id = s
  784. # email_id = []
  785. # matches = re.findall(r'[\w\.-]+@[\w\.-]+', verticaltext)
  786. # for match in matches:
  787. # email_id.append(match)
  788. # # final.append('Email--' + match)
  789. # email_ = str(email_id).replace("[", "").replace("]", "").replace("'", "")
  790. # # final.append(email_)
  791. # # final.append('Email--' + email_)
  792. # # remove_list.append(email_)
  793. if len(email_id) > 1:
  794. final.append(
  795. 'ContactEmail--' + str(email_id[0]).replace("[", "").replace("]", "").replace("\\n", "").replace("'",
  796. ""))
  797. final.append(
  798. 'OrganizationEmail--' + str(email_id[-1]).replace("[", "").replace("]", "").replace("\\n", "").replace(
  799. "'",
  800. ""))
  801. else:
  802. try:
  803. final.append(
  804. 'ContactEmail--' + str(email_id[0]).replace("[", "").replace("]", "").replace("\\n", "").replace(
  805. "'",
  806. ""))
  807. final.append('OrganizationEmail--')
  808. except IndexError:
  809. final.append('ContactEmail--')
  810. final.append('OrganizationEmail--')
###############PINCODE############
# Collect candidate 6-digit pincodes; `addrespinlst` holds any address text
# captured by the Address section earlier in this function.
pinlst = []
print(addrespinlst)
import pgeocode
# try:
# matche1 = re.findall(r'-\d{6}|\b\d{6}\b|\b\d{3} \d{3}\b|-\d{3} \d{3}|\b\d{3} \d{3}\b', addrespinlst[0])
# for i in matche1:
# address3 = i.replace(' ', '').replace('-', '')
# pinlst.append(address3)
# except IndexError:
  821. lst = []
  822. for i in num:
  823. i = i[1:]
  824. lst.append(i)
  825. infile = r"vtext.txt"
  826. outfile = r"cleaned_file.txt"
  827. import glob
  828. delete_list = lst
  829. # delete_list = ["firstname1 lastname1","firstname2 lastname2","firstnamen lastnamen",'Director - Sales & Business Development']
  830. fin = open(infile, "r+")
  831. fout = open(outfile, "w+")
  832. for line12 in fin:
  833. for word in delete_list:
  834. line12 = line12.replace(word, "")
  835. fout.write(line12)
  836. fin.close()
# print(line)
# print(addrespinlst)
import pgeocode
#print(line12)
import re
# NOTE(review): only `line12` -- the LAST line read from vtext.txt above --
# is scanned for a pincode here; scanning the whole cleaned file was
# presumably intended. TODO confirm with the author.
matche1 = re.findall(r'-\d{6}|\b\d{6}\b|\b\d{3} \d{3}\b|-\d{3} \d{3}|\b\d{3} \d{3}\b', line12)
for i in matche1:
address3 = i.replace(' ', '').replace('-', '')
pinlst.append(address3)
# Reverse-geocode the last pincode found via pgeocode.  On success this
# appends PinCode1/country/landmark/state/city rows to `final`; on failure
# (no pincode collected, or the lookup raised) the same keys are appended
# with blank values so the output schema stays fixed.
nomi = pgeocode.Nominatim('IN')
try:
a = nomi.query_postal_code(str(pinlst[-1]))
# print(a)
b = a.keys()
c = b.values.tolist()
d = a.tolist()
postal_code = "PinCode1" + "--" + d[0]
final.append(postal_code)
country_code = c[1] + "--" + str(d[1])
final.append(country_code)
place_name = 'LandMark1' + "--" + str(d[2])
final.append(place_name)
state_name = c[3] + "--" + str(d[3])
final.append(state_name)
state_code = c[4] + "--" + str(d[4])
final.append(state_code)
county_name = 'CityName1' + "--" + str(d[5])
final.append(county_name)
except (IndexError, NameError):
final.append("PinCode1--"+" ")
final.append("country_code--")
final.append("LandMark1--")
final.append("state_name--")
final.append("state_code--")
final.append("CityName1--")
  872. ######################################################## json #####################################################################
  873. import pandas as pd
  874. df = pd.DataFrame(final)
  875. df1 = df[0].str.split('--', expand=True)
  876. # print(df1)
  877. df1.rename({df1.columns[-2]: 'Keys'}, axis=1, inplace=True)
  878. df1.rename({df1.columns[-1]: 'Values'}, axis=1, inplace=True)
  879. df1['Keys'] = df1['Keys'].str.strip()
  880. df1.to_csv('path123.csv', index=False)
  881. df2 = pd.read_csv('path123.csv')
  882. print(df2)
  883. if df2['Values'].isnull().all():
  884. print("Column 'Column2' is empty.")
  885. return 'Invalid image'
  886. else:
  887. pass
  888. df2 = df2.T
  889. df2.to_csv('path1.csv', index=False, header=False)
  890. df1 = pd.read_csv('path1.csv')
  891. df1.to_json('firstjson1.json', orient="index")
  892. import json
  893. with open('firstjson1.json', 'r') as json_file:
  894. json_load = json.load(json_file)
# # url = "https://test.bizgaze.app:8443/apis/v4/bizgaze/integrations/businesscards/create"
# Collapse the single-row JSON produced above into one flat object by
# stripping brackets and the '{"0":' wrapper (string surgery on the dump).
nothing = json.dumps(json_load).replace("]", "").replace("[", "").replace('{"0":', '').replace('}}', '}')
# # print('--------------------------------------------------------------------------')
# # print(nothing)
# --- attach the card image as base64 ---------------------------------------
empty = []
import base64
# `found` is presumably the path of the processed card image, set earlier in
# this function -- confirm against the caller.
name = found
image = open(name, 'rb')
image_read = image.read()
image_64_encode = base64.b64encode(image_read)
NULL = 'null'
empty.append("ByteData--" + (NULL).strip('""'))
image_64_encode = image_64_encode.decode('utf-8')
empty.append("FileData--" + str(image_64_encode))
# Split the path to recover the base filename and its extension.
imagedata = name.split("/")
imagename = str(imagedata[-1]).replace('"', '').replace("[", "").replace("]", "")
imagename1 = str(imagename).split('.')
imagename = str(imagename1[-2]).replace("[", "]")
empty.append("FileName--" + imagename)
empty.append("FilePath--"+ "")
imageExtension = str(imagename1[-1]).replace("[", "]")
empty.append("FileType--" + imageExtension)
image.close()
# Turn the "Key--Value" strings into a single-row dict via a transposed frame.
import pandas as pd
df = pd.DataFrame(empty)
df = df[0].str.split("--", expand=True)
data1 = pd.DataFrame(df[0])
data2 = pd.DataFrame(df[1])
dt = data2.set_index(data1[0])
dt4 = dt.T
dictionary = dt4.to_dict(orient="index")
list1 = []
# list.append(a)
list1.append(dictionary[1])
# # final.append("image--"+str(dictionary[1]).replace("\'",'"'))
print('--------------------')
# print(namelist)
import json
# JSON data:
x = nothing
# python object to be appended
y = {"image": dictionary[1]}
# parsing JSON string:
z = json.loads(x)
# appending the data
z.update(y)
# the result is a JSON string:
# print(json.dumps(z))
# Accumulate this card's merged record; `zlist` is returned at the end of
# the function.
zlist.append(z)
  944. #############################################creating csv#####################################
  945. # print(final)
  946. #print(imagelist)
  947. #final.append('image--' + str(imagelist))
  948. # import requests
  949. # import json
  950. # # url = "https://anwi.bizgaze.app/apis/v4/bizgaze/integrations/businesscards/create/list" #dev
  951. # url = "https://qa.bizgaze.com/apis/v4/bizgaze/integrations/businesscards/create/list" #testing
  952. # # url = "https://test.bizgaze.app/apis/v4/bizgaze/integrations/businesscards/create" # test
  953. # # url='http://localhost:3088/apis/v4/bizgaze/integrations/businesscards/create'
  954. # # url = 'https://c01.bizgaze.app/apis/v4/bizgaze/integrations/businesscards/create' # C01
  955. # payload1 = json.dumps(zlist)
  956. # # print('--------------------------------------------------------------------------')
  957. # #print(payload1)
  958. # headers = {
  959. # #'Authorization': 'stat 1a936137490040c997928f485e3cdd7a', #dev
  960. # # 'Authorization': 'stat 16516391d0074f4c8a15ea16fb49470b',#testing
  961. # # 'Authorization': 'stat 08e55fcfbaa940c8ab8145a074c444d1',
  962. # # 'Authorization': 'stat f7cdb402e01e44e5842878653946168f', # c01
  963. # #'Authorization': 'Stat c3e11b2fcbfe455b86a1fe6efde02a69',#demo
  964. # 'Authorization':'Stat e5bc6ad08f2c42feb5f98a2a521d00af',
  965. # 'Content-Type': 'application/json'
  966. # }
  967. # response = requests.request("POST", url, headers=headers, data=payload1)
  968. # # print("##############################################################")
  969. # print(payload1)
  970. # #print(zlist)
  971. # # import os
  972. # # if 'BusinessCards Created Successfully' in response.text:
  973. # # print('present')
  974. # # os.remove(found)
  975. # # else:
  976. # # print('not present')
  977. # df1.to_json('visitingcard.json')
  978. # data = df1.to_json('visiting.json', orient='records')
  979. # print(data)
  980. #return render_template('index.html')
  981. #return response.text
  982. #return z
  983. return zlist
if __name__ == "__main__":
# Development server bound to all interfaces on port 1112; `app` is
# presumably the Flask instance created near the top of this file --
# confirm before deploying (this binding is not production-safe).
app.run(host='0.0.0.0', port=1112)