"""Flask service that extracts structured contact data from business-card
images/PDFs (OCR via PaddleOCR/Tesseract, NER via Flair + a HuggingFace
city/country tagger) and POSTs the result to an external Bizgaze API.

NOTE(review): this module loads three heavy ML models at import time and
mixes many duplicate function-level imports; left as-is (documentation pass
only).
"""
from flask import Flask, render_template, request, redirect, Response, send_file
import os
import requests
import pandas as pd
import pgeocode
from email_scraper import scrape_emails
import phonenumbers
from pdfminer.high_level import extract_text
import pytesseract
import time
import multiprocessing
from PIL import Image
from functools import partial
from urlextract import URLExtract
import pytesseract as tess
from PIL import Image
# (removed: commented-out doctr OCR and spaCy address-model experiments)
from flair.data import Sentence
from flair.models import SequenceTagger
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# City/country NER model used for address extraction.
tokenizer = AutoTokenizer.from_pretrained("ml6team/bert-base-uncased-city-country-ner")
model = AutoModelForTokenClassification.from_pretrained("ml6team/bert-base-uncased-city-country-ner")
from paddleocr import PaddleOCR, draw_ocr
# Primary OCR engine for image inputs.
ocr = PaddleOCR(use_angle_cls=True, lang='en', use_space_char=True, show_log=True)
# General NER tagger (PER/ORG/LOC) used on the raw OCR text.
tagger = SequenceTagger.load("flair/ner-english-large")
import datetime

app = Flask(__name__)
# app.config["IMAGE_UPLOADS"] = "C:/inetpub/wwwroot/FlaskApplication/Flask_Demo/upload/"


@app.route('/', methods=['GET'])
def resume():
    """Serve the upload page."""
    return render_template('index.html')


# @app.route('/upload_BusinessCards', methods=["POST"])
def predict(Dataset):
    """Process a single business card.

    Dataset: list whose first element is a dict with keys 'FileData'
    (base64 string), 'FileName' and 'FileType'.  Decodes the file into
    ./upload/, OCRs it, runs NER + regex heuristics to extract
    organization/person/address/phone/email/pincode fields, then POSTs
    the assembled JSON to the Bizgaze businesscards API.

    Returns the API response text, or 'Invalid image' when NER yields
    nothing usable.
    """
    print('################## single card detection #######################')
    starttime = datetime.datetime.now()
    print('Execution Started at:', starttime)
    import os
    # (removed: commented-out request.files upload handling)
    a = Dataset[0]
    x = a['FileData']
    y = a['FileName']
    z = a['FileType']
    # CreatedBy=a['CreatedBy']
    name = y + '.' + z

    # Decode the base64 payload to ./upload/<FileName>.<FileType>.
    img_data = x.encode()
    import base64
    with open('./upload/' + name, "wb") as fh:
        fh.write(base64.decodebytes(img_data))

    import re
    import csv
    import glob
    import os
    import numpy as np
    import glob
    import os
    import cv2
    import requests

    final = []          # accumulates "Key--Value" strings for the output JSON
    # final.append('assignto--'+CreatedBy)
    imagelist = []
    remove_list = []    # lines to strip from text before pincode search
    import os
    import glob
    import pdfminer

    found = './upload/' + name
    print(found)
    extension = found.split('.')[-1]

    def org_name():
        """Fallback path: re-OCR with Tesseract (via PDF) and re-run Flair NER,
        repopulating organizations/PErsons/location in the enclosing scope.
        NOTE(review): relies on a hard-coded Windows Tesseract path."""
        print('org_name is working')
        import pytesseract
        fname = found
        if extension != 'pdf':
            # Grayscale the image in place, upscale DPI, then OCR to PDF.
            img = cv2.imread(fname)
            img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            cv2.imwrite(str(found), img)
            from PIL import Image
            im = Image.open(found)
            im.save("images1.png", dpi=(1200, 1200))
            fname = "images1.png"
            import pytesseract as tess
            from PIL import Image
            tess.pytesseract.tesseract_cmd = r"C:\Program Files (x86)\Tesseract-OCR\tesseract.exe"
            pdf = tess.image_to_pdf_or_hocr(fname, extension="pdf")
            with open("demo.pdf", "w+b",) as f:
                f.write(pdf)
            from pdfminer.high_level import extract_text
            text = extract_text('demo.pdf')
        else:
            from pdfminer.high_level import extract_text
            text = extract_text(fname)
        sentence = Sentence(text)
        # predict NER tags
        tagger.predict(sentence)
        ko = (sentence)
        # Flair's str(sentence) renders "text → [tags]"; split on the arrow.
        ko1 = str(ko).split("→")
        import pandas as pd
        dfg = []
        try:
            # NOTE(review): the two .replace("", "") calls are no-ops — they
            # likely lost non-ASCII characters at some point; verify intent.
            s = ko1[1].replace("", "").replace("", "").replace("/", ":")
            dfg.append(s)
            df = pd.DataFrame(dfg)
            df = df[0]
            df.to_csv("df.csv", index=False)
            df1 = pd.read_csv("df.csv")
            ve = df1["0"].str.split(",")
            fgf = ve.to_list()
            dfgh = pd.DataFrame(fgf[0])
            maindf = dfgh[0]
            main1 = maindf.to_list()
            main1
            # Bucket tagged tokens by entity type.
            per = ["PER"]
            org = ["ORG"]
            loc = ["LOC"]
            organizations = [i for i in main1 for j in org if j in i]
            PErsons = [i for i in main1 for j in per if j in i]
            location = [i for i in main1 for j in loc if j in i]
        except IndexError:
            pass

    # ******************************* ORGANIZATION *******************************
    def organisation():
        """Pick an organization name by fuzzy-matching NER ORG tokens against
        the detected website URL; falls back to URL alone, then to company()."""
        print('organisation working ')
        try:
            if len(("OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace(
                    '.com', '').replace('.in', '') + " /" + organizations[1].replace(":ORG", "").replace('"', '').replace(
                    '.com', ''))) < 4:
                pass
            else:
                match = str(urlfinal[0]).lower()
                match = match.replace('.com', '').replace('www.', '').replace('.in', '').replace('.co', '').replace(
                    'https', '').replace(
                    'http', '').replace(":", "").replace("/", "").upper()
                print(match)
                s1g = organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace('.com', '') + " /" + \
                    organizations[1].replace(":ORG", "").replace('"', '').replace('.com', '')
                s1 = s1g.upper()
                s2 = match.upper()
                from difflib import SequenceMatcher
                print(s1)
                print(s2)
                print(SequenceMatcher(None, s1, s2).ratio())
                # Low threshold (0.10) — accept the ORG tokens on any weak match.
                if SequenceMatcher(None, s1, s2).ratio() >= 0.10:
                    final.append(
                        "OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace(
                            '.com', '').replace(']', '') + " /" + organizations[1].replace(":ORG", "").replace('"', '').replace(
                            '.com', '').replace(']', ''))
                else:
                    final.append("OrganizationName--" + s2)
        except IndexError:
            # Only one ORG token available — retry with just organizations[0].
            try:
                if len(("OrganizationName--" + organizations[0].replace(":ORG", "").replace('[', '').replace(']', '').replace(
                        '"', '').replace(
                        '.com', '').replace('.in', ''))) < 4:
                    pass
                else:
                    match = str(urlfinal[0]).lower()
                    match = match.replace('.com', '').replace('www.', '').replace('.in', '').replace('.co', '').replace(
                        'https', '').replace('http', '').replace(":", "").replace("/", "").upper()
                    s1g = organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace('.com', '')
                    s1 = s1g.upper()
                    s2 = match.upper()
                    from difflib import SequenceMatcher
                    print(s1)
                    print(s2)
                    print(SequenceMatcher(None, s1, s2).ratio())
                    if SequenceMatcher(None, s1, s2).ratio() >= 0.10:
                        final.append(
                            "OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace(
                                ']', '').replace(
                                '.com', ''))
                    else:
                        final.append("OrganizationName--" + s2)
            except IndexError:
                # No ORG tokens at all — use the URL, else keyword scan.
                try:
                    match = str(urlfinal[0]).lower()
                    match = match.replace('.com', '').replace('www.', '').replace('.in', '').replace('.co', '').upper()
                    final.append("OrganizationName--" + match)
                except IndexError:
                    company()

    # ############################# company Name #############################
    def company():
        """Last-resort org detection: scan test.txt for company-type keywords
        (ENTERPRISE, LTD, PVT, ...) and take the first matching line."""
        print('company list working')
        import re
        new = []
        with open('test.txt', 'r+') as f:
            flag = False
            for line in f:
                line = line.upper()
                matches = re.findall(
                    r'''\bENTERPRISE\b|\bTRADE\b|\bEMPIRE\b|\bSTORES\b|\bMACHINERY\b|\bINDUSTRIES\b|\bTECHNOLOGY\b|\bCOMPANY\b|\bDESIGNERS\b|\bPOLYMER\b|\bBELT\b|\bAGRO\b|\bPLASTIC\b|\bGROUP\b|\bTOOLS\b|\bENGG.\b|\bSOLUTION\b|\bCONSTRUCTION\b|\bPACK\b|\bELECT\b|\bSTEEL\b|\bIRON\b|\bDIES\b|\bMOULD\b|\bCORPORATION\b|\bSEEDS\b|\bPOWER\b|\bCONSULTANT\b|\bMFG.\b|\bPRINT\b|\bFOOD\b|\bSOLAR\b|\bINDUSTRY\b|\bLIMITED\b|\bPRIVATE\b|\bPVT\b|\bLTD\b|\bOUTSOURCING\b|\bCNC\b|\bMACHINERIES\b|\bSOLUTIONS\b|\bENGINEERS\b|\bWORKS\b|\bPRODUCTS\b|\bENTERPRISES\b|\bCOMPANIES\b|\bPOLYMERS\b|\bTRADING\b''',
                    line)
                for i in matches:
                    if i in line:
                        flag = True
                # NOTE(review): flag is never reset, so every line after the
                # first keyword hit is collected; only new[0] is used below.
                if flag:
                    o = "OrganizationName--" + line
                    new.append(o)
        try:
            a = new[0].replace('\n', '')
            final.append(a)
        except IndexError:
            final.append("OrganizationName--")

    # ******************************* CONTACT PERSON *******************************
    def contactpersonname():
        """Append contact person name(s) from the PER tokens produced by
        org_name(); degrades from two names to one to empty."""
        print('contactpersonname working')
        try:
            final.append(
                "CONTACTPERSONNAME--" + PErsons[0].replace(":PER", "").replace("[", "").replace('"', '').replace("]", "")
                + '/' + PErsons[1].replace(":PER", "").replace('"', ''))
        except IndexError:
            try:
                final.append(
                    "CONTACTPERSONNAME--" + PErsons[0].replace(":PER", "").replace("[", "").replace("]", "").replace(
                        '"', ''))
            except IndexError:
                final.append("CONTACTPERSONNAME--")

    def image_to_text():
        """OCR an image with PaddleOCR; result is stored on the function
        attribute image_to_text.txt (lines shorter than 4 chars dropped)."""
        # (removed: commented-out doctr/Tesseract variants)
        import cv2
        img_path = found
        img = cv2.imread(img_path)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        cv2.imwrite(str(found), img)
        result = ocr.ocr(img_path, cls=True)
        result = result[0]
        txts = [line[1][0] for line in result]
        image_to_text.txt = ""
        for i in txts:
            if len(i) < 4:
                continue
            image_to_text.txt = image_to_text.txt + str(i) + "\n"

    def pdf_to_text():
        """Extract text from a PDF; stored on pdf_to_text.txt."""
        from pdfminer.high_level import extract_text
        pdf_to_text.txt = extract_text(found)

    # Dispatch on extension: images go through PaddleOCR, everything else pdfminer.
    extensionlist = ['JPEG', 'jpg', 'png', 'JPG', 'PNG', 'jpeg']
    if extension in extensionlist:
        print('image' + extension)
        image_to_text()
        x = image_to_text.txt
    else:
        print('pdf' + extension)
        pdf_to_text()
        x = pdf_to_text.txt

    # Three views of the OCR text:
    #   verticaltext   - raw, newline-separated
    #   htext          - newlines flattened to spaces
    #   horizontaltext - newlines flattened to commas
    verticaltext = x
    htext = x
    print(
        '############################################################# this is verticaltext #################################################################')
    print(verticaltext)
    htext = htext.replace('\n', ' ')
    print(
        '############################################################# this is htext #############################################################')
    print(htext)
    y = x.replace('\n', ',')
    y = y.replace(' ', ' ')
    horizontaltext = y
    print(
        '############################################################# this is horizontaltext #############################################################')
    print(horizontaltext)

    # Persist the views for the keyword scanners below.
    textfile = open("test123456.txt", "w")
    a = textfile.write(verticaltext)
    textfile.close()
    textfile = open("vtext.txt", "w")
    a = textfile.write(horizontaltext)
    textfile.close()
    # test.txt = test123456.txt minus blank/pipe-only lines.
    with open('test123456.txt', 'r') as f:
        with open('test.txt', 'w') as w:
            for line in f:
                if line.strip().replace('|', ''):
                    w.write(line)

    # ############################### ADDRESS ###############################
    addrespinlst = []

    def splitaddress():
        """Regex-based address fallback: stitch together the token before the
        first comma, the text up to a pincode-like pattern, and the pincode."""
        import re
        textaddress = htext.replace('\n', ' ')
        address1 = (textaddress.partition(",")[0])
        words = address1.split()
        address1 = words[-1]
        addre = (htext.partition(",")[2])
        a = addre.replace('\n', ' ').replace('\x0c', '')
        addre = (a.partition(",")[2])
        matches = re.findall(
            r'(.*?)-\d{3} \d{3}|(.*?)\b-\d{6}\b|(.*?)\b\d{6}\b|(.*?)\b\d{3} \d{3}\b|\b(.*?)-\d{2}\b|(.*?)\b\d{3} \d{3}\b',
            a)
        for match in matches:
            address2 = match
            address2 = str(address2)
            address2 = address2.replace("'", "").replace("(", "").replace(")", "").replace(', ,', '').replace(' ', '')
        matches = re.findall(r'-\d{6}|\b\d{6}\b|\b\d{3} \d{3}\b|\b-\d{2}\b|-\d{3} \d{3}|\b\d{3} \d{3}\b', a)
        for address3 in matches:
            pass
        try:
            # address2/address3 stay unbound when the regexes found nothing,
            # which is what the NameError below catches.
            Address = address1 + "," + address2 + "," + address3
            final.append('ADDRESS--' + Address)
            addrespinlst.append(Address)
        except NameError:
            print(
                '############################################################ Addressmodelworking #############################################################')
            # (removed: commented-out spaCy address-model fallback)
            pass

    # ############################### website ###############################
    import re
    url = []
    urlfinal = []
    # First pass: anything starting with "www"; fix missing dot after www.
    matches = re.findall(r'www.*', verticaltext)
    for match in matches:
        if (match.count('.')) == 1:
            a_string1 = match.replace("www", "www.")
            url.append(a_string1)
        else:
            url.append(match)
    if len(url) == 0:
        # Second pass: generic URL extraction.
        from urlextract import URLExtract
        extractor = URLExtract()
        urls = extractor.find_urls(verticaltext)
        try:
            urllist = urls[0]
            url.append(urllist)
            url.append(urllist)
        except IndexError:
            pass
        for match in matches:
            if (match.count('.')) == 1:
                a_string1 = match.replace("www", "www.")
                url.append(a_string1)
            else:
                url.append(match)
                url.append(match)
    else:
        pass
    # Keep the URL only if it contains a recognizable TLD-ish token.
    try:
        test_string = url[0]
        test_list = ['com', 'www', 'in', 'co', "WWW", "COM", "CO", "IN"]
        res = [ele for ele in test_list if (ele in test_string)]
        if len(res) == 0:
            print('no match')
            final.append('urls--')
        else:
            print('matched')
            final.append('urls--' + url[0])
            urlfinal.append(url[0])
    except IndexError:
        final.append('urls--')
    print(
        '############################################################# url #############################################################')
    print(url)

    # ####### organisation and contact (main NER pass on the OCR text) #######
    sentence = Sentence(verticaltext)
    # predict NER tags
    tagger.predict(sentence)
    ko = (sentence)
    ko1 = str(ko).split("→")
    import pandas as pd
    dfg = []
    try:
        # Same no-op .replace("", "") pattern as in org_name() — see note there.
        s = ko1[1].replace("", "").replace("", "").replace("/", ":")
    except IndexError:
        # NER produced no tag section at all: reject the upload.
        os.remove(found)
        return 'Invalid image'
    dfg.append(s)
    df = pd.DataFrame(dfg)
    df = df[0]
    df.to_csv("df.csv", index=False)
    df1 = pd.read_csv("df.csv")
    ve = df1["0"].str.split(",")
    fgf = ve.to_list()
    dfgh = pd.DataFrame(fgf[0])
    maindf = dfgh[0]
    main1 = maindf.to_list()
    main1
    per = ["PER"]
    org = ["ORG"]
    loc = ["LOC"]
    organizations = [i for i in main1 for j in org if j in i]
    PErsons = [i for i in main1 for j in per if j in i]
    location = [i for i in main1 for j in loc if j in i]

    # ******************************* ORGANIZATION *******************************
    # Same fuzzy URL-vs-ORG matching as organisation(), inline for the main pass.
    try:
        if len(("OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace(
                ']', '').replace(
                '.com', '') + " /" + organizations[1].replace(":ORG", "").replace('"', '').replace('.com', ''))) < 4:
            pass
        else:
            match = str(urlfinal[0]).lower()
            match = match.replace('.com', '').replace('www.', '').replace('.in', '').replace('.co', '').replace('https',
                                                                                                                '').replace(
                'http', '').replace(":", "").replace("/", "").upper()
            print(match)
            s1g = organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace(']', '').replace(
                '.com', '') + " /" + \
                organizations[1].replace(":ORG", "").replace('"', '').replace('.com', '')
            s1 = s1g.upper()
            s2 = match.upper()
            from difflib import SequenceMatcher
            print(s1)
            print(s2)
            print(SequenceMatcher(None, s1, s2).ratio())
            if SequenceMatcher(None, s1, s2).ratio() >= 0.10:
                final.append("OrganizationName--" +
                             organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace(
                                 '.com', '').replace(']', '') + " /" + organizations[1].replace(":ORG", "").replace('"', '').replace(
                                 '.com', '').replace(']', ''))
            else:
                final.append("OrganizationName--" + s2)
    except IndexError:
        try:
            if len(("OrganizationName--" + organizations[0].replace(":ORG", "").replace('[', '').replace(']',
                                                                                                         '').replace(
                    '"', '').replace(
                    '.com', ''))) < 4:
                pass
            else:
                match = str(urlfinal[0]).lower()
                match = match.replace('.com', '').replace('www.', '').replace('.in', '').replace('.co', '').replace(
                    'https', '').replace('http', '').replace(":", "").replace("/", "").upper()
                s1g = organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace(']', '').replace(
                    '.com', '')
                s1 = s1g.upper()
                s2 = match.upper()
                from difflib import SequenceMatcher
                print(s1)
                print(s2)
                print(SequenceMatcher(None, s1, s2).ratio())
                if SequenceMatcher(None, s1, s2).ratio() >= 0.10:
                    final.append(
                        "OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace(
                            ']', '').replace(
                            '.com', '').replace(']', ''))
                else:
                    final.append("OrganizationName--" + s2)
        except IndexError:
            # No usable ORG tokens: re-OCR with Tesseract and retry.
            org_name()
            organisation()

    # ******************************* CONTACT PERSON *******************************
    try:
        final.append(
            "CONTACTPERSONNAME--" + PErsons[0].replace(":PER", "").replace("[", "").replace('"', '').replace("]", "") +
            PErsons[1].replace(":PER", "").replace('"', ''))
    except IndexError:
        try:
            final.append(
                "CONTACTPERSONNAME--" + PErsons[0].replace(":PER", "").replace("[", "").replace("]", "").replace('"',
                                                                                                                 ''))
        except IndexError:
            org_name()
            contactpersonname()

    # ############### address via HF city/country NER #####################
    try:
        print(
            '############################################################# address new code #############################################################')
        # Anchor words that typically start an address on Indian cards.
        loactionlst = ['address', 'factory', 'd.no', 'h.no', 'h. no', 'plot', 'flat', 'plat']
        loclst = [i for i in loactionlst if i in htext.lower()]
        textaddress = htext
        textaddress = textaddress.replace("|", ",")
        textaddress = textaddress.lower()
        nlp = pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy="simple")
        grop = nlp(textaddress)
        citycountry = []
        print('########################### city or country name ###########################')
        # Only the LAST entity is considered; assumes the address ends the card.
        d = grop[-1]
        if d['entity_group'] == "COUNTRY":
            print(d["word"])
            citycountry.append(d["word"])
        elif d['entity_group'] == "CITY":
            print(d["word"])
            citycountry.append(d["word"])
        try:
            address1 = loclst[0]
        except IndexError:
            address1 = (textaddress.partition(",")[0])
            words = address1.split()
            address1 = words[-1]
        star_location = address1.lower()
        # IndexError here (no city/country found) drops to splitaddress().
        end_location = citycountry[0].replace("#", "")
        start = star_location
        end = end_location
        s = textaddress.lower()
        # Take the text between the anchor word and the city/country.
        middle_address = (s.split(start))[-1].split(end)[0]
        Address = start + middle_address + end
        Address = Address.replace('--', '').title()
        print(Address)
        if Address.count(',') < 2:
            splitaddress()
        else:
            final.append('ADDRESS--' + Address)
    except IndexError:
        splitaddress()

    # ############################ Designation ############################
    import re
    new = []
    with open('test.txt', 'r') as f:
        flag = False
        for line in f:
            line1 = line
            line = line.upper()
            matches = re.findall(
                r'''\bAPPRENTICE\b|\bEXECUTIVE\b|\bPROPRIETOR\b|\bPARTNER\b|\bMD\b|\bANALYST\b|\bPRACTITIONER\b|\bCUSTOMER\b|\bCOO\b|\bCOACH\b|\bADMINISTRATIVE\b|\bADMINISTRATOR\b|\bAGENT\b|\bHEAD\b|\bCHIEF\b|\bDIRECTOR\b|\bVICE\b|\bPRESIDENT\b|\bMANAGER\b|\bCOORDINATOR\b|\bCOUNSELOR\b|\bSUPERVISOR\b|\bASSISTANT\b|\bSPECIALIST\b|\bARTIST\b|\bWORKER\b|\bCONSULTANT\b|\bREPRESENTATIVE\b|\bARCHITECT\b|\bSTAFF\b|\bMEMBER\b|\bDEVELOPER\b|\bENGINEER\b|\bEXAMINOR\b|\bDOCTOR\b|\bPROFESSOR\b|\bTEACHER\b|\bLEAD\b|\bOFFICER\b|\bCEO\b|\bC.E.O\b|\bJUNIOR\b|\bSENIOR\b|\bPROFESSOR\b|\bSALES\b''',
                line)
            for match in matches:
                line = line.replace('-', '')
                o = "Designation--" + line
                new.append(o)
                # Remember the original line so it can be excluded later.
                remove_list.append(str(line1).replace('\n', ''))
    try:
        a = new[0].replace('\n', '')
        final.append(a)
    except IndexError:
        final.append("Designation--")

    # ############################ Phone number ############################
    num = []
    import phonenumbers
    # Strip country code / bracket noise, then match Indian numbers.
    numbers = phonenumbers.PhoneNumberMatcher(
        verticaltext.replace('+91', '').replace('(0)', '').replace('(', '').replace(')', ''), "IN")
    for number in numbers:
        # str(PhoneNumberMatch) looks like "PhoneNumberMatch [a,b) 98765...";
        # keep the part after the ")".
        number = str(number).split(")")
        num.append(number[1])
    if len(num) == 0:
        final.append("ContactNumber--")
        final.append("OrganizationNumber--")
    elif len(num) > 1:
        final.append("ContactNumber--" + num[0].replace(' ', ''))
        final.append("OrganizationNumber--" + num[-1].replace(' ', ''))
    elif len(num) == 1:
        try:
            final.append("ContactNumber--" + num[0].replace(' ', ''))
            final.append("OrganizationNumber--")
        except IndexError:
            final.append("ContactNumber--")
            final.append("OrganizationNumber--")
    print(
        '############################################################# num #############################################################')
    print(num)

    # ############################### Email ###############################
    import re
    from email_scraper import scrape_emails
    s = list(scrape_emails(horizontaltext))
    email_id = s
    if len(email_id) > 1:
        final.append(
            'ContactEmail--' + str(email_id[0]).replace("[", "").replace("]", "").replace("\\n", "").replace("'", ""))
        final.append(
            'OrganizationEmail--' + str(email_id[-1]).replace("[", "").replace("]", "").replace("\\n", "").replace("'",
                                                                                                                   ""))
    else:
        try:
            final.append(
                'ContactEmail--' + str(email_id[0]).replace("[", "").replace("]", "").replace("\\n", "").replace("'",
                                                                                                                 ""))
            final.append('OrganizationEmail--')
        except IndexError:
            final.append('ContactEmail--')
            final.append('OrganizationEmail--')

    # ############################### PINCODE ###############################
    pinlst = []
    print(addrespinlst)
    import pgeocode
    # Strip the leading digit of each phone number so phone digits don't
    # get mistaken for pincodes in the text scan below.
    lst = []
    for i in num:
        i = i[1:]
        lst.append(i)
    infile = r"vtext.txt"
    outfile = r"cleaned_file.txt"
    import glob
    delete_list = lst
    fin = open(infile, "r+")
    fout = open(outfile, "w+")
    for line12 in fin:
        for word in delete_list:
            line12 = line12.replace(word, "")
        fout.write(line12)
    fin.close()
    # NOTE(review): fout is never closed, and the pincode regex below runs
    # only on the LAST line12 from the loop — confirm this is intended.
    import pgeocode
    print(line12)
    import re
    matche1 = re.findall(r'-\d{6}|\b\d{6}\b|\b\d{3} \d{3}\b|-\d{3} \d{3}|\b\d{3} \d{3}\b', line12)
    for i in matche1:
        address3 = i.replace(' ', '').replace('-', '')
        pinlst.append(address3)
    # Resolve the pincode to place/state/country via pgeocode (India).
    nomi = pgeocode.Nominatim('IN')
    try:
        a = nomi.query_postal_code(str(pinlst[-1]))
        b = a.keys()
        c = b.values.tolist()
        d = a.tolist()
        postal_code = "PinCode1" + "--" + d[0]
        final.append(postal_code)
        country_code = c[1] + "--" + str(d[1])
        final.append(country_code)
        place_name = 'LandMark1' + "--" + str(d[2])
        final.append(place_name)
        state_name = c[3] + "--" + str(d[3])
        final.append(state_name)
        state_code = c[4] + "--" + str(d[4])
        final.append(state_code)
        county_name = 'CityName1' + "--" + str(d[5])
        final.append(county_name)
    except (IndexError, NameError):
        final.append("PinCode1--")
        final.append("country_code--")
        final.append("LandMark1--")
        final.append("state_name--")
        final.append("state_code--")
        final.append("CityName1--")

    # ################################ json ################################
    # Round-trip the "Key--Value" list through CSV to build a flat JSON object.
    import pandas as pd
    df = pd.DataFrame(final)
    df1 = df[0].str.split('--', expand=True)
    df1.rename({df1.columns[-2]: 'Keys'}, axis=1, inplace=True)
    df1.rename({df1.columns[-1]: 'Values'}, axis=1, inplace=True)
    df1['Keys'] = df1['Keys'].str.strip()
    df1.to_csv('path12.csv', index=False)
    df2 = pd.read_csv('path12.csv')
    print(final)
    print(df2)
    df2 = df2.T
    df2.to_csv('path.csv', index=False, header=False)
    df1 = pd.read_csv('path.csv')
    df1.to_json('firstjson.json', orient="index")
    import json
    with open('firstjson.json', 'r') as json_file:
        json_load = json.load(json_file)
    # Strip the pandas index wrapper ({"0": {...}}) down to a bare object.
    nothing = json.dumps(json_load).replace("]", "").replace("[", "").replace('{"0":', '').replace('}}', '}')

    # Attach the original file (base64) under "image".
    empty = []
    import base64
    name = found
    image = open(name, 'rb')
    image_read = image.read()
    image_64_encode = base64.b64encode(image_read)
    NULL = 'null'
    empty.append("ByteData--" + (NULL).strip('""'))
    image_64_encode = image_64_encode.decode('utf-8')
    empty.append("FileData--" + str(image_64_encode))
    imagedata = name.split("/")
    imagename = str(imagedata[-1]).replace('"', '').replace("[", "").replace("]", "")
    imagename1 = str(imagename).split('.')
    imagename = str(imagename1[-2]).replace("[", "]")
    empty.append("FileName--" + imagename)
    empty.append("FilePath--" + found)
    imageExtension = str(imagename1[-1]).replace("[", "]")
    empty.append("FileType--" + imageExtension)
    image.close()
    import pandas as pd
    df = pd.DataFrame(empty)
    df = df[0].str.split("--", expand=True)
    data1 = pd.DataFrame(df[0])
    data2 = pd.DataFrame(df[1])
    dt = data2.set_index(data1[0])
    dt4 = dt.T
    dictionary = dt4.to_dict(orient="index")
    list1 = []
    list1.append(dictionary[1])
    print('--------------------')
    import json
    # Merge the extracted fields with the image payload.
    x = nothing
    y = {"image": dictionary[1]}
    z = json.loads(x)
    z.update(y)

    # POST the assembled card to the Bizgaze API.
    # url = "https://demo.bizgaze.app/apis/v4/bizgaze/integrations/businesscards/create"
    url = "https://test.bizgaze.app/apis/v4/bizgaze/integrations/businesscards/create"
    # url = 'https://c01.bizgaze.app/apis/v4/bizgaze/integrations/businesscards/create'  # C01
    payload1 = json.dumps(z)
    headers = {
        'Authorization': 'stat db226c95fae04943aa3e3c03a4381b2a',
        # 'Authorization': 'stat f7cdb402e01e44e5842878653946168f',  # c01
        # 'Authorization': 'Stat c3e11b2fcbfe455b86a1fe6efde02a69',  # demosss
        'Content-Type': 'application/json'
    }
    response = requests.request("POST", url, headers=headers, data=payload1)
    print("##############################################################")
    print(response.text)
    # Only delete the upload once the API confirms creation.
    if 'BusinessCards Created Successfully' in response.text:
        print('present')
        os.remove(found)
    else:
        print('not present')
    endtime = datetime.datetime.now()
    print('Completed at:', endtime)
    print(starttime)
    print(endtime)
    print('--------------------------')
    return response.text


# @app.route('/upload_BusinessCards', methods=["POST"])
# @app.route('/multiplecards', methods=["POST"])
def multiplecards(Dataset):
    """Batch variant of predict(): iterate over a list of card dicts and run
    the same extraction pipeline per card (files land in ./multicards/).

    NOTE(review): this definition continues beyond the visible chunk; the
    code below reproduces only the portion in view.
    """
    print('################## multiple card detection #######################')
    data = {'visiting': Dataset}
    for i in data['visiting']:
        import time
        # time.sleep(1)
        a = i
        x = a['FileData']
        y = a['FileName']
        z = a['FileType']
        # CreatedBy=a['CreatedBy']
        name = y + '.' + z
        img_data = x.encode()
        import base64
        with open('./multicards/' + name, "wb") as fh:
            fh.write(base64.decodebytes(img_data))
        found = './multicards/' + name
        print(found)
        extension = found.split('.')[-1]
        import re
        import csv
        import glob
        import os
        import numpy as np
        import glob
        import os
        import cv2
        import requests
        final = []
        imagelist = []
        remove_list = []
        import os
        import glob
        import pdfminer

        def org_name():
            """Tesseract re-OCR + Flair NER fallback (same as in predict())."""
            print('org_name is working')
            import pytesseract
            fname = found
            if extension != 'pdf':
                img = cv2.imread(fname)
                img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
                cv2.imwrite(str(found), img)
                from PIL import Image
                im = Image.open(found)
                im.save("images1.png", dpi=(1200, 1200))
                fname = "images1.png"
                import pytesseract as tess
                from PIL import Image
                tess.pytesseract.tesseract_cmd = r"C:\Program Files (x86)\Tesseract-OCR\tesseract.exe"
                pdf = tess.image_to_pdf_or_hocr(fname, extension="pdf")
                with open("demo.pdf", "w+b",) as f:
                    f.write(pdf)
                from pdfminer.high_level import extract_text
                text = extract_text('demo.pdf')
            else:
                from pdfminer.high_level import extract_text
                text = extract_text(fname)
            sentence = Sentence(text)
            # predict NER tags
            tagger.predict(sentence)
            ko = (sentence)
            ko1 = str(ko).split("→")
            import pandas as pd
            dfg = []
            try:
                # Same no-op .replace("", "") pattern as in predict() — verify.
                s = ko1[1].replace("", "").replace("", "").replace("/", ":")
                dfg.append(s)
                df = pd.DataFrame(dfg)
                df = df[0]
                df.to_csv("df.csv", index=False)
                df1 = pd.read_csv("df.csv")
                ve = df1["0"].str.split(",")
                fgf = ve.to_list()
                dfgh = pd.DataFrame(fgf[0])
                maindf = dfgh[0]
                main1 = maindf.to_list()
                main1
                per = ["PER"]
                org = ["ORG"]
                loc = ["LOC"]
                organizations = [i for i in main1 for j in org if j in i]
                PErsons = [i for i in main1 for j in per if j in i]
                location = [i for i in main1 for j in loc if j in i]
            except IndexError:
                pass

        # *************************** ORGANIZATION ***************************
        def organisation():
            """Fuzzy URL-vs-ORG organization picker (same as in predict())."""
            print('organisation working ')
            try:
                if len(("OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace(
                        '.com', '').replace('.in', '') + " /" + organizations[1].replace(":ORG", "").replace('"', '').replace(
                        '.com', ''))) < 4:
                    pass
                else:
                    match = str(urlfinal[0]).lower()
                    match = match.replace('.com', '').replace('www.', '').replace('.in', '').replace('.co', '').replace(
                        'https', '').replace(
                        'http', '').replace(":", "").replace("/", "").upper()
                    print(match)
                    s1g = organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace('.com', '') + " /" + \
                        organizations[1].replace(":ORG", "").replace('"', '').replace('.com', '')
                    s1 = s1g.upper()
                    s2 = match.upper()
                    from difflib import SequenceMatcher
                    print(s1)
                    print(s2)
                    print(SequenceMatcher(None, s1, s2).ratio())
                    if SequenceMatcher(None, s1, s2).ratio() >= 0.10:
                        final.append(
                            "OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace(
                                '.com', '').replace(']', '') + " /" + organizations[1].replace(":ORG", "").replace('"', '').replace(
                                '.com', '').replace(']', ''))
                    else:
                        final.append("OrganizationName--"
+ s2) except IndexError: try: if len(("OrganizationName--" + organizations[0].replace(":ORG", "").replace('[', '').replace(']', '').replace( '"', '').replace( '.com', '').replace('.in', ''))) < 4: pass else: match = str(urlfinal[0]).lower() match = match.replace('.com', '').replace('www.', '').replace('.in', '').replace('.co', '').replace( 'https', '').replace('http', '').replace(":", "").replace("/", "").upper() s1g = organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace('.com', '') s1 = s1g.upper() s2 = match.upper() from difflib import SequenceMatcher print(s1) print(s2) print(SequenceMatcher(None, s1, s2).ratio()) if SequenceMatcher(None, s1, s2).ratio() >= 0.10: # and SequenceMatcher(None, s1, s2).ratio()<0.50: final.append( "OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace( '[', '').replace( ']', '').replace( '.com', '')) else: final.append("OrganizationName--" + s2) except IndexError: try: match = str(urlfinal[0]).lower() match = match.replace('.com', '').replace('www.', '').replace('.in', '').replace('.co', '').upper() final.append("OrganizationName--" + match) # remove_list.append(match) except IndexError: company() #################################################company Name######################################## def company(): print('company list working') import re new = [] with open('test.txt', 'r+') as f: flag = False for line in f: line = line.upper() matches = re.findall( 
r'''\bENTERPRISE\b|\bTRADE\b|\bEMPIRE\b|\bSTORES\b|\bMACHINERY\b|\bINDUSTRIES\b|\bTECHNOLOGY\b|\bCOMPANY\b|\bDESIGNERS\b|\bPOLYMER\b|\bBELT\b|\bAGRO\b|\bPLASTIC\b|\bGROUP\b|\bTOOLS\b|\bENGG.\b|\bSOLUTION\b|\bCONSTRUCTION\b|\bPACK\b|\bELECT\b|\bSTEEL\b|\bIRON\b|\bDIES\b|\bMOULD\b|\bCORPORATION\b|\bSEEDS\b|\bPOWER\b|\bCONSULTANT\b|\bMFG.\b|\bPRINT\b|\bFOOD\b|\bSOLAR\b|\bINDUSTRY\b|\bLIMITED\b|\bPRIVATE\b|\bPVT\b|\bLTD\b|\bOUTSOURCING\b|\bCNC\b|\bMACHINERIES\b|\bSOLUTIONS\b|\bENGINEERS\b|\bWORKS\b|\bPRODUCTS\b|\bENTERPRISES\b|\bCOMPANIES\b|\bPOLYMERS\b|\bTRADING\b''', line) for i in matches: if i in line: flag = True if flag: o = "OrganizationName--" + line new.append(o) # if line.startswith('\n'): # flag = False try: a = new[0].replace('\n', '') final.append(a) except IndexError: final.append("OrganizationName--") # ************************************* CONTACT PERSON ******************************************************************* def contactpersonname(): print('contactpersonname working') try: final.append( "CONTACTPERSONNAME--" + PErsons[0].replace(":PER", "").replace("[", "").replace('"', '').replace( "]", "") + '/' + PErsons[ 1].replace(":PER", "").replace('"', '')) except IndexError: try: final.append( "CONTACTPERSONNAME--" + PErsons[0].replace(":PER", "").replace("[", "").replace("]", "").replace( '"', '')) except IndexError: final.append("CONTACTPERSONNAME--") def image_to_text(): # doc = DocumentFile.from_images(found) # result = model(doc) # image_to_text.txt = result.render() # tess.pytesseract.tesseract_cmd = r"C:\\Program Files (x86)\\Tesseract-OCR\\tesseract.exe" # img = Image.open(found) # text = tess.image_to_string(img) # image_to_text.txt = text # print(text) import cv2 img_path = found img = cv2.imread(img_path) img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) cv2.imwrite(str(found), img) result = ocr.ocr(img_path, cls=True) result = result[0] txts = [line[1][0] for line in result] image_to_text.txt = "" for i in txts: if len(i) < 4: continue # 
print(i+"\n") image_to_text.txt = image_to_text.txt + str(i) + "\n" # print(image_to_text.txt) def pdf_to_text(): from pdfminer.high_level import extract_text pdf_to_text.txt = extract_text(found) # pdf_to_text.txt= text.replace('\n', ' ') extensionlist = ['JPEG', 'jpg', 'png', 'JPG', 'PNG', 'jpeg'] if extension in extensionlist: print('image' + extension) image_to_text() x = image_to_text.txt else: print('pdf' + extension) pdf_to_text() x = pdf_to_text.txt verticaltext = x htext = x # print('------------------------------------------------') print( '############################################################# this is verticaltext #################################################################') print(verticaltext) htext = htext.replace('\n', ' ') print( '############################################################# this is htext #############################################################') print(htext) y = x.replace('\n', ',') y = y.replace(' ', ' ') # y = y.replace(".", " .") horizontaltext = y # print('------------------------------------------------') print( '############################################################# this is horizontaltext #############################################################') print(horizontaltext) textfile = open("test123456.txt", "w") a = textfile.write(verticaltext) textfile.close() textfile = open("vtext.txt", "w") a = textfile.write(horizontaltext) textfile.close() with open('test123456.txt', 'r') as f: with open('test.txt', 'w') as w: for line in f: if line.strip().replace('|', ''): w.write(line) ###########################ADDRESS################################## addrespinlst = [] def splitaddress(): import re textaddress = htext.replace('\n', ' ') # print(textaddress) address1 = (textaddress.partition(",")[0]) words = address1.split() address1 = words[-1] addre = (htext.partition(",")[2]) a = addre.replace('\n', ' ').replace('\x0c', '') addre = (a.partition(",")[2]) matches = re.findall( r'(.*?)-\d{3} 
\d{3}|(.*?)\b-\d{6}\b|(.*?)\b\d{6}\b|(.*?)\b\d{3} \d{3}\b|\b(.*?)-\d{2}\b|(.*?)\b\d{3} \d{3}\b', a) for match in matches: address2 = match address2 = str(address2) address2 = address2.replace("'", "").replace("(", "").replace(")", "").replace(', ,', '').replace(' ', '') matches = re.findall(r'-\d{6}|\b\d{6}\b|\b\d{3} \d{3}\b|\b-\d{2}\b|-\d{3} \d{3}|\b\d{3} \d{3}\b', a) for address3 in matches: pass try: Address = address1 + "," + address2 + "," + address3 final.append('ADDRESS--' + Address) addrespinlst.append(Address) except NameError: print( '############################################################ Addressmodelworking #############################################################') # doc = nlp_model1(textaddress) # addlist = [] # for ent in doc.ents: # name = (f'{ent.label_.upper():{10}}--{ent.text}') # addlist.append(name) # try: # Address = addlist[0] # final.append(Address) # addrespinlst.append(Address) # remove_list.append( # str(Address).replace("[", "").replace("]", "").replace("\\n", "").replace("'", "").replace( # "ADDRESS--", # "")) # except IndexError: # final.append("ADDRESS--") pass ################################################## website####################################################### # import re # url = [] # matches = re.findall(r'www.*', verticaltext) # for match in matches: # if (match.count('.')) == 1: # a_string1 = match.replace("www", "www.") # final.append("Urls--" + a_string1) # url.append(a_string1) # else: # final.append("Urls--" + match) # if len(url)==0: # from urlextract import URLExtract # extractor = URLExtract() # urls = extractor.find_urls(verticaltext) # try: # urllist = urls[0] # final.append("Urls--"+urllist) # url.append(urllist) # except IndexError: # final.append("Urls--") # for match in matches: # if (match.count('.')) == 1: # a_string1 = match.replace("www", "www.") # final.append("Urls--" + a_string1) # url.append(a_string1) # else: # final.append("Urls--" + match) # url.append(match) # remove_list.append(match) 
# else: # final.append("Urls--" ) ################################################## website####################################################### import re # final=[] url = [] urlfinal = [] matches = re.findall(r'www.*', verticaltext) for match in matches: if (match.count('.')) == 1: a_string1 = match.replace("www", "www.") # final.append("Urls--" + a_string1) url.append(a_string1) else: url.append(match) if len(url) == 0: from urlextract import URLExtract extractor = URLExtract() urls = extractor.find_urls(verticaltext) try: urllist = urls[0] url.append(urllist) url.append(urllist) except IndexError: pass for match in matches: if (match.count('.')) == 1: a_string1 = match.replace("www", "www.") url.append(a_string1) # url.append(a_string1) else: url.append(match) url.append(match) else: pass try: test_string = url[0] test_list = ['com', 'www', 'in', 'co', "WWW", "COM", "CO", "IN"] res = [ele for ele in test_list if (ele in test_string)] if len(res) == 0: print('no match') final.append('urls--') else: print('matched') final.append('urls--' + url[0]) urlfinal.append(url[0]) except IndexError: final.append('urls--') print( '############################################################# url #############################################################') print(url) #######organisation and contact################ # def company_url(): # # print('--url--') # # print(url) # try: # match = str(url[0]).lower() # match =match.replace('.com','').replace('www.','').replace('.in','').replace('.co','').upper() # final.append("OrganizationName--" + match) # # remove_list.append(match) # except IndexError: # org_name() # organisation() # final.append("OrganizationName--") # make example sentence # print(horizontaltext) sentence = Sentence(verticaltext) # predict NER tags tagger.predict(sentence) # print sentence ko = (sentence) ko1 = str(ko).split("→") import pandas as pd dfg = [] try: s = ko1[1].replace("", "").replace("", "").replace("/", ":") except IndexError: os.remove(found) 
return 'Invalid image' dfg.append(s) df = pd.DataFrame(dfg) df = df[0] df.to_csv("df.csv", index=False) df1 = pd.read_csv("df.csv") ve = df1["0"].str.split(",") fgf = ve.to_list() dfgh = pd.DataFrame(fgf[0]) maindf = dfgh[0] # .str.split(":") # maindf.to_csv("main.csv") main1 = maindf.to_list() main1 # cv=pd.DataFrame(ve) # cv per = ["PER"] org = ["ORG"] loc = ["LOC"] organizations = [i for i in main1 for j in org if j in i] PErsons = [i for i in main1 for j in per if j in i] location = [i for i in main1 for j in loc if j in i] # ************************************* ORGANIZATION ******************************************************************** try: if len(("OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace( ']', '').replace( '.com', '') + " /" + organizations[1].replace(":ORG", "").replace('"', '').replace('.com', ''))) < 4: pass # company_url() else: match = str(urlfinal[0]).lower() match = match.replace('.com', '').replace('www.', '').replace('.in', '').replace('.co', '').replace( 'https', '').replace( 'http', '').replace(":", "").replace("/", "").upper() print(match) s1g = organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace(']', '').replace( '.com', '') + " /" + \ organizations[1].replace(":ORG", "").replace('"', '').replace('.com', '') s1 = s1g.upper() s2 = match.upper() from difflib import SequenceMatcher print(s1) print(s2) print(SequenceMatcher(None, s1, s2).ratio()) if SequenceMatcher(None, s1, s2).ratio() >= 0.10: # and SequenceMatcher(None, s1, s2).ratio()<0.50: final.append( "OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace( '.com', '').replace(']', '') + " /" + organizations[1].replace(":ORG", "").replace('"', '').replace( '.com', '').replace(']', '')) else: final.append("OrganizationName--" + s2) except IndexError: try: if len(("OrganizationName--" + organizations[0].replace(":ORG", "").replace('[', '').replace(']', 
'').replace( '"', '').replace( '.com', ''))) < 4: pass # company_url() else: match = str(urlfinal[0]).lower() match = match.replace('.com', '').replace('www.', '').replace('.in', '').replace('.co', '').replace( 'https', '').replace('http', '').replace(":", "").replace("/", "").upper() s1g = organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace(']', '').replace( '.com', '') s1 = s1g.upper() s2 = match.upper() from difflib import SequenceMatcher print(s1) print(s2) print(SequenceMatcher(None, s1, s2).ratio()) if SequenceMatcher(None, s1, s2).ratio() >= 0.10: # and SequenceMatcher(None, s1, s2).ratio()<0.50: final.append( "OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace( ']', '').replace( '.com', '').replace(']', '')) else: final.append("OrganizationName--" + s2) except IndexError: org_name() organisation() # final.append("OrganizationName--") # ************************************* CONTACT PERSON ******************************************************************* try: final.append( "CONTACTPERSONNAME--" + PErsons[0].replace(":PER", "").replace("[", "").replace('"', '').replace("]", "") + PErsons[ 1].replace(":PER", "").replace('"', '')) except IndexError: try: final.append( "CONTACTPERSONNAME--" + PErsons[0].replace(":PER", "").replace("[", "").replace("]", "").replace( '"', '')) except IndexError: org_name() contactpersonname() # final.append("CONTACTPERSONNAME--") ###############address flair##################### try: print( '############################################################# address new code #############################################################') loactionlst = ['address', 'factory', 'd.no', 'h.no', 'h. 
no', 'plot', 'flat', 'plat'] loclst = [i for i in loactionlst if i in htext.lower()] textaddress = htext textaddress = textaddress.replace("|", ",") textaddress = textaddress.lower() nlp = pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy="simple") grop = nlp(textaddress) citycountry = [] print('########################### city or country name ###########################') d = grop[-1] if d['entity_group'] == "COUNTRY": print(d["word"]) citycountry.append(d["word"]) elif d['entity_group'] == "CITY": print(d["word"]) citycountry.append(d["word"]) try: address1 = loclst[0] except IndexError: address1 = (textaddress.partition(",")[0]) words = address1.split() address1 = words[-1] star_location = address1.lower() end_location = citycountry[0].replace("#", "") start = star_location end = end_location s = textaddress.lower() middle_address = (s.split(start))[-1].split(end)[0] Address = start + middle_address + end Address = Address.replace('--', '').title() print(Address) if Address.count(',') < 2: splitaddress() else: final.append('ADDRESS--' + Address) # star_location = location[0].replace(":LOC", "").replace('"', '').replace('[', '') # end_location = location[-1].replace(":LOC", "").replace('"', '').replace(']', '') # d1 = star_location.split() # d2 = end_location.split() # d3 = d1[0] # d4 = d2[0] # start = d3 # end = d4 # s = horizontaltext # middle_address = ((s.split(start))[1].split(end)[0]) # Address = d3 + middle_address + d4 # final.append('ADDRESS--' + Address) # addrespinlst.append(Address) except IndexError: splitaddress() ########################################## Designation ########################################### import re new = [] with open('test.txt', 'r') as f: flag = False for line in f: line1 = line line = line.upper() matches = re.findall( 
r'''\bAPPRENTICE\b|\bEXECUTIVE\b|\bPROPRIETOR\b|\bPARTNER\b|\bMD\b|\bANALYST\b|\bPRACTITIONER\b|\bCUSTOMER\b|\bCOO\b|\bCOACH\b|\bADMINISTRATIVE\b|\bADMINISTRATOR\b|\bAGENT\b|\bHEAD\b|\bCHIEF\b|\bDIRECTOR\b|\bVICE\b|\bPRESIDENT\b|\bMANAGER\b|\bCOORDINATOR\b|\bCOUNSELOR\b|\bSUPERVISOR\b|\bASSISTANT\b|\bSPECIALIST\b|\bARTIST\b|\bWORKER\b|\bCONSULTANT\b|\bREPRESENTATIVE\b|\bARCHITECT\b|\bSTAFF\b|\bMEMBER\b|\bDEVELOPER\b|\bENGINEER\b|\bEXAMINOR\b|\bDOCTOR\b|\bPROFESSOR\b|\bTEACHER\b|\bLEAD\b|\bOFFICER\b|\bCEO\b|\bC.E.O\b|\bJUNIOR\b|\bSENIOR\b|\bPROFESSOR\b|\bSALES\b''', line) for match in matches: line = line.replace('-', '') # print(line) o = "Designation--" + line new.append(o) remove_list.append(str(line1).replace('\n', '')) try: a = new[0].replace('\n', '') final.append(a) except IndexError: final.append("Designation--") ###################################################Phone number################################################# num = [] import phonenumbers # print(verticaltext) numbers = phonenumbers.PhoneNumberMatcher( verticaltext.replace('+91', '').replace('(0)', '').replace('(', '').replace(')', ''), "IN") for number in numbers: number = str(number).split(")") num.append(number[1]) # num.append(number[-1]) if len(num) == 0: final.append("ContactNumber--") final.append("OrganizationNumber--") elif len(num) > 1: final.append("ContactNumber--" + num[0].replace(' ', '')) final.append("OrganizationNumber--" + num[-1].replace(' ', '')) elif len(num) == 1: try: final.append("ContactNumber--" + num[0].replace(' ', '')) final.append("OrganizationNumber--") except IndexError: final.append("ContactNumber--") final.append("OrganizationNumber--") print( '############################################################# num #############################################################') print(num) # try: # final.append("PhoneNumber--" + num[0].replace(' ', '')) # remove_list.append(num[0]) # except IndexError: # pass # try: # final.append("PhoneNumber1--" + num[1].replace(' 
', '')) # remove_list.append(num[1]) # except IndexError: # pass # try: # final.append("PhoneNumber2--" + num[2].replace(' ', '')) # remove_list.append(num[2]) # except IndexError: # pass ################################################### Email###################################################### import re from email_scraper import scrape_emails s = list(scrape_emails(horizontaltext)) email_id = s # email_id = [] # matches = re.findall(r'[\w\.-]+@[\w\.-]+', verticaltext) # for match in matches: # email_id.append(match) # # final.append('Email--' + match) # email_ = str(email_id).replace("[", "").replace("]", "").replace("'", "") # # final.append(email_) # # final.append('Email--' + email_) # # remove_list.append(email_) if len(email_id) > 1: final.append( 'ContactEmail--' + str(email_id[0]).replace("[", "").replace("]", "").replace("\\n", "").replace("'", "")) final.append( 'OrganizationEmail--' + str(email_id[-1]).replace("[", "").replace("]", "").replace("\\n", "").replace( "'", "")) else: try: final.append( 'ContactEmail--' + str(email_id[0]).replace("[", "").replace("]", "").replace("\\n", "").replace( "'", "")) final.append('OrganizationEmail--') except IndexError: final.append('ContactEmail--') final.append('OrganizationEmail--') ###############PINCODE############ pinlst = [] print(addrespinlst) import pgeocode # try: # matche1 = re.findall(r'-\d{6}|\b\d{6}\b|\b\d{3} \d{3}\b|-\d{3} \d{3}|\b\d{3} \d{3}\b', addrespinlst[0]) # for i in matche1: # address3 = i.replace(' ', '').replace('-', '') # pinlst.append(address3) # except IndexError: lst = [] for i in num: i = i[1:] lst.append(i) infile = r"vtext.txt" outfile = r"cleaned_file.txt" import glob delete_list = lst # delete_list = ["firstname1 lastname1","firstname2 lastname2","firstnamen lastnamen",'Director - Sales & Business Development'] fin = open(infile, "r+") fout = open(outfile, "w+") for line12 in fin: for word in delete_list: line12 = line12.replace(word, "") fout.write(line12) fin.close() # 
print(line) # print(addrespinlst) import pgeocode print(line12) import re matche1 = re.findall(r'-\d{6}|\b\d{6}\b|\b\d{3} \d{3}\b|-\d{3} \d{3}|\b\d{3} \d{3}\b', line12) for i in matche1: address3 = i.replace(' ', '').replace('-', '') pinlst.append(address3) nomi = pgeocode.Nominatim('IN') try: a = nomi.query_postal_code(str(pinlst[-1])) # print(a) b = a.keys() c = b.values.tolist() d = a.tolist() postal_code = "PinCode1" + "--" + d[0] final.append(postal_code) country_code = c[1] + "--" + str(d[1]) final.append(country_code) place_name = 'LandMark1' + "--" + str(d[2]) final.append(place_name) state_name = c[3] + "--" + str(d[3]) final.append(state_name) state_code = c[4] + "--" + str(d[4]) final.append(state_code) county_name = 'CityName1' + "--" + str(d[5]) final.append(county_name) except (IndexError, NameError): final.append("PinCode1--") final.append("country_code--") final.append("LandMark1--") final.append("state_name--") final.append("state_code--") final.append("CityName1--") ######################################################## json ##################################################################### import pandas as pd df = pd.DataFrame(final) df1 = df[0].str.split('--', expand=True) # print(df1) df1.rename({df1.columns[-2]: 'Keys'}, axis=1, inplace=True) df1.rename({df1.columns[-1]: 'Values'}, axis=1, inplace=True) df1['Keys']=df1['Keys'].str.strip() df1.to_csv('path123.csv', index=False) df2 = pd.read_csv('path123.csv') print(df2) df2 = df2.T df2.to_csv('path1.csv', index=False, header=False) df1 = pd.read_csv('path1.csv') df1.to_json('firstjson1.json', orient="index") import json with open('firstjson1.json', 'r') as json_file: json_load = json.load(json_file) # # url = "https://test.bizgaze.app:8443/apis/v4/bizgaze/integrations/businesscards/create" nothing = json.dumps(json_load).replace("]", "").replace("[", "").replace('{"0":', '').replace('}}', '}') # # print('--------------------------------------------------------------------------') # # 
print(nothing) empty = [] import base64 name = found image = open(name, 'rb') image_read = image.read() image_64_encode = base64.b64encode(image_read) NULL = 'null' empty.append("ByteData--" + (NULL).strip('""')) image_64_encode = image_64_encode.decode('utf-8') empty.append("FileData--" + str(image_64_encode)) imagedata = name.split("/") imagename = str(imagedata[-1]).replace('"', '').replace("[", "").replace("]", "") imagename1 = str(imagename).split('.') imagename = str(imagename1[-2]).replace("[", "]") empty.append("FileName--" + imagename) empty.append("FilePath--" + found) imageExtension = str(imagename1[-1]).replace("[", "]") empty.append("FileType--" + imageExtension) image.close() import pandas as pd df = pd.DataFrame(empty) df = df[0].str.split("--", expand=True) data1 = pd.DataFrame(df[0]) data2 = pd.DataFrame(df[1]) dt = data2.set_index(data1[0]) dt4 = dt.T dictionary = dt4.to_dict(orient="index") list1 = [] # list.append(a) list1.append(dictionary[1]) # # final.append("image--"+str(dictionary[1]).replace("\'",'"')) print('--------------------') # print(namelist) import json # JSON data: x = nothing # python object to be appended y = {"image": dictionary[1]} # parsing JSON string: z = json.loads(x) # appending the data z.update(y) # the result is a JSON string: # print(json.dumps(z)) #############################################creating csv##################################### # print(final) # print(imagelist) # final.append('image--'+str(imagelist)) # import requests # import json # # with open('visitingcard1.json', 'r') as json_file: # # json_load = json.load(json_file) # url = "https://demo.bizgaze.app/apis/v4/bizgaze/integrations/businesscards/create" url = "https://test.bizgaze.app/apis/v4/bizgaze/integrations/businesscards/create" # url = 'https://c01.bizgaze.app/apis/v4/bizgaze/integrations/businesscards/create' # C01 payload1 = json.dumps(z) # print('--------------------------------------------------------------------------') # print(payload1) 
# --- Tail of the multi-card extraction pipeline + HTTP routing ---------------------------------
# NOTE(review): this file's newlines appear to have been stripped by extraction; the single
# physical line below holds many collapsed statements. Code is left byte-identical.
#
# What the visible code does:
#  1) Builds request headers with a hard-coded 'stat ...' Authorization token and
#     Content-Type application/json, then POSTs `payload1` (the assembled card JSON) to the
#     Bizgaze integrations endpoint via requests.request("POST", url, ...).
#     NOTE(review): secret token committed in source — should be moved to env/config; verify
#     before rotating.
#  2) If the response body contains 'BusinessCards Created Successfully', deletes the uploaded
#     image file (`found`); otherwise leaves it in place. Returns the raw response text to the
#     caller (this `return` belongs to the card-processing function whose `def` is above this
#     excerpt).
#  3) A fully commented-out legacy '/upload_BusinessCards' handler (multiprocessing.Pool based)
#     is retained inline — dead code kept for reference.
#  4) `mainfunction` is the live '/upload_BusinessCards' POST route: it reads the JSON body and
#     dispatches on its length — a single-element payload goes to predict(), anything else to
#     multiplecards(). Presumably the payload is a JSON array of file dicts (FileData/FileName/
#     FileType), matching predict()'s Dataset[0] usage earlier in this file — TODO confirm with
#     the client contract.
#  5) Script entry point runs the Flask development server on 0.0.0.0:1112
#     (dev server — not suitable for production as-is).
headers = { 'Authorization': 'stat db226c95fae04943aa3e3c03a4381b2a', # 'Authorization': 'stat f7cdb402e01e44e5842878653946168f', # c01 # 'Authorization': 'Stat c3e11b2fcbfe455b86a1fe6efde02a69',#demo 'Content-Type': 'application/json' } response = requests.request("POST", url, headers=headers, data=payload1) # print("##############################################################") # #print(payload1) print(response.text) import os if 'BusinessCards Created Successfully' in response.text: print('present') os.remove(found) else: print('not present') # df1.to_json('visitingcard.json') # data = df1.to_json('visiting.json', orient='records') # print(data) # return render_template('index.html') # files = glob.glob('./upload/*') # for f in files: # os.remove(f) # print('Time Taken:',total) return response.text # return 'done' # # return send_file(p,as_attachment=True) # @app.route('/upload_BusinessCards', methods=["POST"]) # def upload_BusinessCards(): # if __name__ == "__main__": # url_list = [] # Dataset = request.get_json() # print("8888888888888888888888888888888888888888888888888888888888888888888888888888888888") # #print(Dataset) # # id = "100013660000125" # url_list.append(Dataset) # # multiprocessing # with multiprocessing.Pool(processes=1) as pool: # # try: # results = pool.map(predict, url_list) # # except IndexError: # # return 'Invalid image' # # results.clear() # # a=results[0] # pool.close() # return results[0] @app.route('/upload_BusinessCards', methods=["POST"]) def mainfunction(): Dataset = request.get_json() if len(Dataset)==1: # predict(Dataset) return predict(Dataset) else: # multiplecards(Dataset) return multiplecards(Dataset) if __name__ == "__main__": app.run(host='0.0.0.0',port=1112)