From 8b1964e0b61d2be96579b19f76d7138c90248f49 Mon Sep 17 00:00:00 2001 From: SadhulaSaiKumar Date: Tue, 2 May 2023 05:32:46 +0000 Subject: [PATCH] Update 'Business_cards/Business_cards.py' --- Business_cards/Business_cards.py | 2232 +++++++++++++++--------------- 1 file changed, 1112 insertions(+), 1120 deletions(-) diff --git a/Business_cards/Business_cards.py b/Business_cards/Business_cards.py index fad586e..178f35e 100644 --- a/Business_cards/Business_cards.py +++ b/Business_cards/Business_cards.py @@ -1,1121 +1,1113 @@ -from flask import Flask, render_template, request, redirect, Response, send_file -import os -import openai -import requests -import pandas as pd -import pgeocode -from email_scraper import scrape_emails -import phonenumbers -from pdfminer.high_level import extract_text -import pytesseract -import time -import multiprocessing -from PIL import Image -from functools import partial -from urlextract import URLExtract -import pytesseract as tess -from PIL import Image -# from doctr.io import DocumentFile -# from doctr.models import ocr_predictor -# model = ocr_predictor(pretrained=True) -# load tagger -###################################################### -import os -import glob - -from pytesseract import * -import shutil -import cv2 -import matplotlib -from werkzeug.utils import secure_filename -import requests -import spacy -import time -import multiprocessing -from PIL import Image -from functools import partial -nlp_model = spacy.load("D:/projects/C01app/Resume_parser/ME") -nlp_model1 = spacy.load("D:/projects/C01app/Resume_parser/bdeeducation_50_0.2") -from flask import Flask, render_template, request, redirect, Response, send_file - -import pandas as pd -################################################################ -Current_Working_Directory=os.getcwd() -Current_Working_Directory=Current_Working_Directory.replace("\\","/") -nlp_model1 = spacy.load(Current_Working_Directory + "/Invoice_parser/p") - -################################################################ -# import spacy - -# nlp_model1 = spacy.load('./ADD3001.2') -from flair.data import Sentence -from flair.models import SequenceTagger -from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline - -tokenizer = AutoTokenizer.from_pretrained("ml6team/bert-base-uncased-city-country-ner") -model = AutoModelForTokenClassification.from_pretrained("ml6team/bert-base-uncased-city-country-ner") - -from paddleocr import PaddleOCR, draw_ocr - -ocr = PaddleOCR(use_angle_cls=True, lang='en', use_space_char=True, show_log=True) -tagger = SequenceTagger.load("flair/ner-english-large") - -import datetime - -app = Flask(__name__) - - -# app.config["IMAGE_UPLOADS"] = "C:/inetpub/wwwroot/FlaskApplication/Flask_Demo/upload/" - -@app.route('/', methods=['GET']) -def card(): - return render_template('card.html') - - - -@app.route('/upload_BusinessCards', methods=["POST"]) -# @app.route('/multiplecards', methods=["POST"]) -def multiplecards(): - # print('################## multiple card detection #######################') - # print(Dataset) - datalist=[] - Dataset = request.get_json() - # print(data) - #datalist.append(Dataset) - data = {'visiting': Dataset} - for i in data['visiting']: - import time - # time.sleep(1) - a = i - x = a['FileData'] - # print(x) - y = a['FileName'] - z = a['FileType'] - # CreatedBy=a['CreatedBy'] - - name = y + '.' + z - # print(name) - # print(y) - # image = y.split("/") - # filename=image[-1] - - # print(x) - img_data = x.encode() - - import base64 - with open('./multicards/' + name, "wb") as fh: - fh.write(base64.decodebytes(img_data)) - # print(i) - - # import os - # import glob - # for i in glob.glob('./multipleupload/*'): - - found = './multicards/' + name - print(found) - extension = found.split('.')[-1] - - # for root, dirs, fils in os.glob('./multipleupload'): - # for name in files: - # foundfile= os.path.join(root, name) - # print(foundfile) - - import re - import csv - import glob - import os - # import pytesseract - # import cv2 - import numpy as np - import glob - import os - import cv2 - import requests - final = [] - # final.append('assignto--'+CreatedBy) - imagelist = [] - # print(found) - remove_list = [] - import os - import glob - import pdfminer - - # import os - # ts = 0 - # for file_name in glob.glob('./upload/*'): - # fts = os.path.getmtime(file_name) - # if fts > ts: - # ts = fts - # found = file_name - # print(found) - - # print(extension) - - def org_name(): - print('org_name is working') - import pytesseract - fname = found - if extension != 'pdf': - - img = cv2.imread(fname) - img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) - - cv2.imwrite(str(found), img) - from PIL import Image - im = Image.open(found) - im.save("images1.png", dpi=(1200, 1200)) - # import pytesseract - fname = "images1.png" - import pytesseract as tess - from PIL import Image - - tess.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe" - pdf = tess.image_to_pdf_or_hocr(fname, extension="pdf") - with open("demo.pdf", "w+b", ) as f: - f.write(pdf) - - from pdfminer.high_level import extract_text - text = extract_text('demo.pdf') - # doc = DocumentFile.from_images(found) - # result = model(doc) - # text = result.render() - - # from pdfminer.high_level import extract_text - # txt = extract_text('demo.pdf') - else: - from pdfminer.high_level import extract_text - text = extract_text(fname) - - sentence = Sentence(text) - - # predict NER tags - tagger.predict(sentence) - - # print sentence - ko = (sentence) - - ko1 = str(ko).split("→") - import pandas as pd - - dfg = [] - try: - s = ko1[1].replace("", "").replace("", "").replace("/", ":") - - # os.remove(found) - # return 'Invalid image' - dfg.append(s) - df = pd.DataFrame(dfg) - df = df[0] - - df.to_csv("df.csv", index=False) - - df1 = pd.read_csv("df.csv") - ve = df1["0"].str.split(",") - fgf = ve.to_list() - dfgh = pd.DataFrame(fgf[0]) - maindf = dfgh[0] # .str.split(":") - # maindf.to_csv("main.csv") - - main1 = maindf.to_list() - main1 - # cv=pd.DataFrame(ve) - # cv - per = ["PER"] - org = ["ORG"] - loc = ["LOC"] - organizations = [i for i in main1 for j in org if j in i] - PErsons = [i for i in main1 for j in per if j in i] - location = [i for i in main1 for j in loc if j in i] - except IndexError: - pass - - # ************************************* ORGANIZATION ******************************************************************** - - def organisation(): - print('organisation working ') - try: - if len(("OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace('[', - '').replace( - '.com', '').replace('.in', '') + " /" + organizations[1].replace(":ORG", "").replace('"', - '').replace( - '.com', ''))) < 4: - pass - - - else: - - match = str(urlfinal[0]).lower() - match = match.replace('.com', '').replace('www.', '').replace('.in', '').replace('.co', '').replace( - 'https', - '').replace( - 'http', '').replace(":", "").replace("/", "").upper() - print(match) - - s1g = organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace('.com', - '') + " /" + \ - organizations[1].replace(":ORG", "").replace('"', '').replace('.com', '') - s1 = s1g.upper() - s2 = match.upper() - from difflib import SequenceMatcher - print(s1) - print(s2) - print(SequenceMatcher(None, s1, s2).ratio()) - if SequenceMatcher(None, s1, s2).ratio() >= 0.10: - # and SequenceMatcher(None, s1, s2).ratio()<0.50: - final.append( - "OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace('[', - '').replace( - '.com', '').replace(']', '') + " /" + organizations[1].replace(":ORG", "").replace('"', - '').replace( - '.com', - '').replace(']', '')) - else: - final.append("OrganizationName--" + s2) - - except IndexError: - try: - if len(("OrganizationName--" + organizations[0].replace(":ORG", "").replace('[', '').replace(']', - '').replace( - '"', - '').replace( - '.com', '').replace('.in', ''))) < 4: - pass - - else: - match = str(urlfinal[0]).lower() - match = match.replace('.com', '').replace('www.', '').replace('.in', '').replace('.co', - '').replace( - 'https', '').replace('http', '').replace(":", "").replace("/", "").upper() - - s1g = organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace('.com', '') - s1 = s1g.upper() - s2 = match.upper() - from difflib import SequenceMatcher - print(s1) - print(s2) - print(SequenceMatcher(None, s1, s2).ratio()) - if SequenceMatcher(None, s1, s2).ratio() >= 0.10: - # and SequenceMatcher(None, s1, s2).ratio()<0.50: - final.append( - "OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace( - '[', - '').replace( - ']', '').replace( - '.com', '')) - else: - final.append("OrganizationName--" + s2) - - except IndexError: - try: - match = str(urlfinal[0]).lower() - match = match.replace('.com', '').replace('www.', '').replace('.in', '').replace('.co', - '').upper() - final.append("OrganizationName--" + match) - # remove_list.append(match) - except IndexError: - company() - - #################################################company Name######################################## - - def company(): - print('company list working') - import re - - new = [] - with open('test.txt', 'r+') as f: - flag = False - for line in f: - line = line.upper() - matches = re.findall( - r'''\bENTERPRISE\b|\bTRADE\b|\bEMPIRE\b|\bSTORES\b|\bMACHINERY\b|\bINDUSTRIES\b|\bTECHNOLOGY\b|\bCOMPANY\b|\bDESIGNERS\b|\bPOLYMER\b|\bBELT\b|\bAGRO\b|\bPLASTIC\b|\bGROUP\b|\bTOOLS\b|\bENGG.\b|\bSOLUTION\b|\bCONSTRUCTION\b|\bPACK\b|\bELECT\b|\bSTEEL\b|\bIRON\b|\bDIES\b|\bMOULD\b|\bCORPORATION\b|\bSEEDS\b|\bPOWER\b|\bCONSULTANT\b|\bMFG.\b|\bPRINT\b|\bFOOD\b|\bSOLAR\b|\bINDUSTRY\b|\bLIMITED\b|\bPRIVATE\b|\bPVT\b|\bLTD\b|\bOUTSOURCING\b|\bCNC\b|\bMACHINERIES\b|\bSOLUTIONS\b|\bENGINEERS\b|\bWORKS\b|\bPRODUCTS\b|\bENTERPRISES\b|\bCOMPANIES\b|\bPOLYMERS\b|\bTRADING\b''', - line) - - for i in matches: - if i in line: - flag = True - if flag: - o = "OrganizationName--" + line - new.append(o) - # if line.startswith('\n'): - # flag = False - try: - a = new[0].replace('\n', '') - final.append(a) - except IndexError: - final.append("OrganizationName--") - - # ************************************* CONTACT PERSON ******************************************************************* - def contactpersonname(): - print('contactpersonname working') - try: - final.append( - "CONTACTPERSONNAME--" + PErsons[0].replace(":PER", "").replace("[", "").replace('"', '').replace( - "]", - "") + '/' + - PErsons[ - 1].replace(":PER", "").replace('"', '')) - except IndexError: - try: - final.append( - "CONTACTPERSONNAME--" + PErsons[0].replace(":PER", "").replace("[", "").replace("]", - "").replace( - '"', '')) - except IndexError: - final.append("CONTACTPERSONNAME--") - - def image_to_text(): - - # doc = DocumentFile.from_images(found) - # result = model(doc) - # image_to_text.txt = result.render() - - # tess.pytesseract.tesseract_cmd = r"C:\\Program Files (x86)\\Tesseract-OCR\\tesseract.exe" - # img = Image.open(found) - # text = tess.image_to_string(img) - # image_to_text.txt = text - # print(text) - import cv2 - img_path = found - img = cv2.imread(img_path) - img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) - cv2.imwrite(str(found), img) - - result = ocr.ocr(img_path, cls=True) - result = result[0] - - txts = [line[1][0] for line in result] - - image_to_text.txt = "" - for i in txts: - if len(i) < 4: - continue - # print(i+"\n") - image_to_text.txt = image_to_text.txt + str(i) + "\n" - # print(image_to_text.txt) - - def pdf_to_text(): - - from pdfminer.high_level import extract_text - pdf_to_text.txt = extract_text(found) - # pdf_to_text.txt= text.replace('\n', ' ') - - extensionlist = ['JPEG', 'jpg', 'png', 'JPG', 'PNG', 'jpeg'] - - if extension in extensionlist: - print('image' + extension) - image_to_text() - x = image_to_text.txt - - else: - print('pdf' + extension) - pdf_to_text() - x = pdf_to_text.txt - - verticaltext = x - htext = x - # print('------------------------------------------------') - print( - '############################################################# this is verticaltext #################################################################') - print(verticaltext) - htext = htext.replace('\n', ' ') - print( - '############################################################# this is htext #############################################################') - print(htext) - y = x.replace('\n', ',') - y = y.replace(' ', ' ') - # y = y.replace(".", " .") - horizontaltext = y - # print('------------------------------------------------') - print( - '############################################################# this is horizontaltext #############################################################') - print(horizontaltext) - - textfile = open("test123456.txt", "w") - a = textfile.write(verticaltext) - textfile.close() - textfile = open("vtext.txt", "w") - a = textfile.write(horizontaltext) - textfile.close() - with open('test123456.txt', 'r') as f: - with open('test.txt', 'w') as w: - for line in f: - if line.strip().replace('|', ''): - w.write(line) - - ###########################ADDRESS################################## - addrespinlst = [] - - def splitaddress(): - import re - textaddress = htext.replace('\n', ' ') - # print(textaddress) - - address1 = (textaddress.partition(",")[0]) - words = address1.split() - address1 = words[-1] - addre = (htext.partition(",")[2]) - a = addre.replace('\n', ' ').replace('\x0c', '') - addre = (a.partition(",")[2]) - matches = re.findall( - r'(.*?)-\d{3} \d{3}|(.*?)\b-\d{6}\b|(.*?)\b\d{6}\b|(.*?)\b\d{3} \d{3}\b|\b(.*?)-\d{2}\b|(.*?)\b\d{3} \d{3}\b', - a) - for match in matches: - address2 = match - address2 = str(address2) - address2 = address2.replace("'", "").replace("(", "").replace(")", "").replace(', ,', '').replace(' ', - '') - - matches = re.findall(r'-\d{6}|\b\d{6}\b|\b\d{3} \d{3}\b|\b-\d{2}\b|-\d{3} \d{3}|\b\d{3} \d{3}\b', a) - for address3 in matches: - pass - try: - Address = address1 + "," + address2 + "," + address3 - final.append('ADDRESS--' + Address) - addrespinlst.append(Address) - - except NameError: - - print( - '############################################################ Addressmodelworking #############################################################') - - # doc = nlp_model1(textaddress) - # addlist = [] - # for ent in doc.ents: - # name = (f'{ent.label_.upper():{10}}--{ent.text}') - # addlist.append(name) - # try: - # Address = addlist[0] - # final.append(Address) - # addrespinlst.append(Address) - # remove_list.append( - # str(Address).replace("[", "").replace("]", "").replace("\\n", "").replace("'", "").replace( - # "ADDRESS--", - # "")) - # except IndexError: - # final.append("ADDRESS--") - pass - - ################################################## website####################################################### - - # import re - - # url = [] - # matches = re.findall(r'www.*', verticaltext) - # for match in matches: - # if (match.count('.')) == 1: - # a_string1 = match.replace("www", "www.") - - # final.append("Urls--" + a_string1) - # url.append(a_string1) - # else: - - # final.append("Urls--" + match) - - # if len(url)==0: - - # from urlextract import URLExtract - - # extractor = URLExtract() - # urls = extractor.find_urls(verticaltext) - # try: - # urllist = urls[0] - # final.append("Urls--"+urllist) - # url.append(urllist) - # except IndexError: - # final.append("Urls--") - - # for match in matches: - # if (match.count('.')) == 1: - # a_string1 = match.replace("www", "www.") - - # final.append("Urls--" + a_string1) - # url.append(a_string1) - # else: - - # final.append("Urls--" + match) - # url.append(match) - # remove_list.append(match) - # else: - # final.append("Urls--" ) - - ################################################## website####################################################### - - import re - # final=[] - url = [] - urlfinal = [] - matches = re.findall(r'www.*', verticaltext) - for match in matches: - - if (match.count('.')) == 1: - a_string1 = match.replace("www", "www.") - - # final.append("Urls--" + a_string1) - url.append(a_string1) - else: - - url.append(match) - - if len(url) == 0: - - from urlextract import URLExtract - - extractor = URLExtract() - urls = extractor.find_urls(verticaltext) - try: - urllist = urls[0] - url.append(urllist) - url.append(urllist) - except IndexError: - pass - - for match in matches: - if (match.count('.')) == 1: - a_string1 = match.replace("www", "www.") - - url.append(a_string1) - # url.append(a_string1) - else: - - url.append(match) - url.append(match) - - else: - pass - try: - test_string = url[0] - - test_list = ['com', 'www', 'in', 'co', "WWW", "COM", "CO", "IN"] - - res = [ele for ele in test_list if (ele in test_string)] - - if len(res) == 0: - print('no match') - - final.append('urls--') - - - else: - print('matched') - final.append('urls--' + url[0]) - urlfinal.append(url[0]) - - - except IndexError: - final.append('urls--') - - print( - '############################################################# url #############################################################') - print(url) - #######organisation and contact################ - - # def company_url(): - # # print('--url--') - # # print(url) - - # try: - # match = str(url[0]).lower() - # match =match.replace('.com','').replace('www.','').replace('.in','').replace('.co','').upper() - # final.append("OrganizationName--" + match) - # # remove_list.append(match) - # except IndexError: - # org_name() - # organisation() - # final.append("OrganizationName--") - - # make example sentence - - # print(horizontaltext) - sentence = Sentence(verticaltext) - - # predict NER tags - tagger.predict(sentence) - - # print sentence - ko = (sentence) - - ko1 = str(ko).split("→") - import pandas as pd - - dfg = [] - try: - s = ko1[1].replace("", "").replace("", "").replace("/", ":") - except IndexError: - os.remove(found) - return 'Invalid image' - dfg.append(s) - df = pd.DataFrame(dfg) - df = df[0] - - df.to_csv("df.csv", index=False) - - df1 = pd.read_csv("df.csv") - ve = df1["0"].str.split(",") - fgf = ve.to_list() - dfgh = pd.DataFrame(fgf[0]) - maindf = dfgh[0] # .str.split(":") - # maindf.to_csv("main.csv") - - main1 = maindf.to_list() - main1 - # cv=pd.DataFrame(ve) - # cv - per = ["PER"] - org = ["ORG"] - loc = ["LOC"] - organizations = [i for i in main1 for j in org if j in i] - PErsons = [i for i in main1 for j in per if j in i] - location = [i for i in main1 for j in loc if j in i] - - # ************************************* ORGANIZATION ******************************************************************** - try: - if len(("OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace('[', - '').replace( - ']', '').replace( - '.com', '') + " /" + organizations[1].replace(":ORG", "").replace('"', '').replace('.com', ''))) < 4: - pass - # company_url() - else: - - match = str(urlfinal[0]).lower() - match = match.replace('.com', '').replace('www.', '').replace('.in', '').replace('.co', '').replace( - 'https', - '').replace( - 'http', '').replace(":", "").replace("/", "").upper() - print(match) - - s1g = organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace(']', '').replace( - '.com', '') + " /" + \ - organizations[1].replace(":ORG", "").replace('"', '').replace('.com', '') - s1 = s1g.upper() - s2 = match.upper() - from difflib import SequenceMatcher - print(s1) - print(s2) - print(SequenceMatcher(None, s1, s2).ratio()) - if SequenceMatcher(None, s1, s2).ratio() >= 0.10: - # and SequenceMatcher(None, s1, s2).ratio()<0.50: - final.append( - "OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace('[', - '').replace( - '.com', '').replace(']', '') + " /" + organizations[1].replace(":ORG", "").replace('"', - '').replace( - '.com', '').replace(']', '')) - else: - final.append("OrganizationName--" + s2) - - - - except IndexError: - try: - if len(("OrganizationName--" + organizations[0].replace(":ORG", "").replace('[', '').replace(']', - '').replace( - '"', - '').replace( - '.com', ''))) < 4: - pass - # company_url() - else: - - match = str(urlfinal[0]).lower() - match = match.replace('.com', '').replace('www.', '').replace('.in', '').replace('.co', '').replace( - 'https', '').replace('http', '').replace(":", "").replace("/", "").upper() - - s1g = organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace(']', - '').replace( - '.com', '') - s1 = s1g.upper() - s2 = match.upper() - from difflib import SequenceMatcher - print(s1) - print(s2) - print(SequenceMatcher(None, s1, s2).ratio()) - if SequenceMatcher(None, s1, s2).ratio() >= 0.10: - # and SequenceMatcher(None, s1, s2).ratio()<0.50: - final.append( - "OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace('[', - '').replace( - ']', '').replace( - '.com', '').replace(']', '')) - else: - final.append("OrganizationName--" + s2) - - except IndexError: - org_name() - organisation() - - # final.append("OrganizationName--") - - # ************************************* CONTACT PERSON ******************************************************************* - try: - final.append( - "CONTACTPERSONNAME--" + PErsons[0].replace(":PER", "").replace("[", "").replace('"', '').replace("]", - "") + - PErsons[ - 1].replace(":PER", "").replace('"', '')) - except IndexError: - try: - final.append( - "CONTACTPERSONNAME--" + PErsons[0].replace(":PER", "").replace("[", "").replace("]", "").replace( - '"', - '')) - except IndexError: - org_name() - contactpersonname() - # final.append("CONTACTPERSONNAME--") - ###############address flair##################### - - try: - print( - '############################################################# address new code #############################################################') - loactionlst = ['address', 'factory', 'd.no', 'h.no', 'h. no', 'plot', 'flat', 'plat'] - loclst = [i for i in loactionlst if i in htext.lower()] - - textaddress = htext - textaddress = textaddress.replace("|", ",") - textaddress = textaddress.lower() - - nlp = pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy="simple") - grop = nlp(textaddress) - - citycountry = [] - print('########################### city or country name ###########################') - d = grop[-1] - - if d['entity_group'] == "COUNTRY": - print(d["word"]) - citycountry.append(d["word"]) - elif d['entity_group'] == "CITY": - print(d["word"]) - citycountry.append(d["word"]) - - try: - address1 = loclst[0] - except IndexError: - address1 = (textaddress.partition(",")[0]) - words = address1.split() - address1 = words[-1] - - star_location = address1.lower() - end_location = citycountry[0].replace("#", "") - start = star_location - end = end_location - s = textaddress.lower() - middle_address = (s.split(start))[-1].split(end)[0] - Address = start + middle_address + end - Address = Address.replace('--', '').title() - print(Address) - if Address.count(',') < 2: - splitaddress() - else: - final.append('ADDRESS--' + Address) - - # star_location = location[0].replace(":LOC", "").replace('"', '').replace('[', '') - # end_location = location[-1].replace(":LOC", "").replace('"', '').replace(']', '') - # d1 = star_location.split() - # d2 = end_location.split() - # d3 = d1[0] - # d4 = d2[0] - # start = d3 - # end = d4 - # s = horizontaltext - # middle_address = ((s.split(start))[1].split(end)[0]) - # Address = d3 + middle_address + d4 - # final.append('ADDRESS--' + Address) - # addrespinlst.append(Address) - - - except IndexError: - splitaddress() - - ########################################## Designation ########################################### - import re - new = [] - with open('test.txt', 'r') as f: - flag = False - for line in f: - line1 = line - line = line.upper() - matches = re.findall( - r'''\bAPPRENTICE\b|\bEXECUTIVE\b|\bPROPRIETOR\b|\bPARTNER\b|\bMD\b|\bANALYST\b|\bPRACTITIONER\b|\bCUSTOMER\b|\bCOO\b|\bCOACH\b|\bADMINISTRATIVE\b|\bADMINISTRATOR\b|\bAGENT\b|\bHEAD\b|\bCHIEF\b|\bDIRECTOR\b|\bVICE\b|\bPRESIDENT\b|\bMANAGER\b|\bCOORDINATOR\b|\bCOUNSELOR\b|\bSUPERVISOR\b|\bASSISTANT\b|\bSPECIALIST\b|\bARTIST\b|\bWORKER\b|\bCONSULTANT\b|\bREPRESENTATIVE\b|\bARCHITECT\b|\bSTAFF\b|\bMEMBER\b|\bDEVELOPER\b|\bENGINEER\b|\bEXAMINOR\b|\bDOCTOR\b|\bPROFESSOR\b|\bTEACHER\b|\bLEAD\b|\bOFFICER\b|\bCEO\b|\bC.E.O\b|\bJUNIOR\b|\bSENIOR\b|\bPROFESSOR\b|\bSALES\b''', - line) - for match in matches: - line = line.replace('-', '') - # print(line) - o = "Designation--" + line - new.append(o) - remove_list.append(str(line1).replace('\n', '')) - - try: - a = new[0].replace('\n', '') - final.append(a) - - except IndexError: - final.append("Designation--") - - ###################################################Phone number################################################# - num = [] - import phonenumbers - - # print(verticaltext) - numbers = phonenumbers.PhoneNumberMatcher( - verticaltext.replace('+91', '').replace('(0)', '').replace('(', '').replace(')', ''), "IN") - - for number in numbers: - number = str(number).split(")") - num.append(number[1]) - # num.append(number[-1]) - if len(num) == 0: - final.append("ContactNumber--") - final.append("OrganizationNumber--") - elif len(num) > 1: - final.append("ContactNumber--" + num[0].replace(' ', '')) - final.append("OrganizationNumber--" + num[-1].replace(' ', '')) - elif len(num) == 1: - try: - final.append("ContactNumber--" + num[0].replace(' ', '')) - final.append("OrganizationNumber--") - except IndexError: - final.append("ContactNumber--") - final.append("OrganizationNumber--") - print( - '############################################################# num #############################################################') - print(num) - # try: - # final.append("PhoneNumber--" + num[0].replace(' ', '')) - # remove_list.append(num[0]) - # except IndexError: - # pass - # try: - # final.append("PhoneNumber1--" + num[1].replace(' ', '')) - # remove_list.append(num[1]) - # except IndexError: - # pass - # try: - # final.append("PhoneNumber2--" + num[2].replace(' ', '')) - # remove_list.append(num[2]) - # except IndexError: - # pass - - ################################################### Email###################################################### - import re - from email_scraper import scrape_emails - s = list(scrape_emails(horizontaltext)) - email_id = s - - # email_id = [] - # matches = re.findall(r'[\w\.-]+@[\w\.-]+', verticaltext) - # for match in matches: - # email_id.append(match) - - # # final.append('Email--' + match) - # email_ = str(email_id).replace("[", "").replace("]", "").replace("'", "") - # # final.append(email_) - - # # final.append('Email--' + email_) - # # remove_list.append(email_) - if len(email_id) > 1: - final.append( - 'ContactEmail--' + str(email_id[0]).replace("[", "").replace("]", "").replace("\\n", "").replace("'", - "")) - final.append( - 'OrganizationEmail--' + str(email_id[-1]).replace("[", "").replace("]", "").replace("\\n", "").replace( - "'", - "")) - else: - try: - final.append( - 'ContactEmail--' + str(email_id[0]).replace("[", "").replace("]", "").replace("\\n", "").replace( - "'", - "")) - final.append('OrganizationEmail--') - except IndexError: - final.append('ContactEmail--') - final.append('OrganizationEmail--') - - ###############PINCODE############ - - pinlst = [] - print(addrespinlst) - import pgeocode - - # try: - # matche1 = re.findall(r'-\d{6}|\b\d{6}\b|\b\d{3} \d{3}\b|-\d{3} \d{3}|\b\d{3} \d{3}\b', addrespinlst[0]) - # for i in matche1: - # address3 = i.replace(' ', '').replace('-', '') - # pinlst.append(address3) - # except IndexError: - - lst = [] - for i in num: - i = i[1:] - lst.append(i) - - infile = r"vtext.txt" - outfile = r"cleaned_file.txt" - import glob - delete_list = lst - # delete_list = ["firstname1 lastname1","firstname2 lastname2","firstnamen lastnamen",'Director - Sales & Business Development'] - fin = open(infile, "r+") - fout = open(outfile, "w+") - for line12 in fin: - for word in delete_list: - line12 = line12.replace(word, "") - - fout.write(line12) - fin.close() - # print(line) - - # print(addrespinlst) - import pgeocode - print(line12) - import re - matche1 = re.findall(r'-\d{6}|\b\d{6}\b|\b\d{3} \d{3}\b|-\d{3} \d{3}|\b\d{3} \d{3}\b', line12) - for i in matche1: - address3 = i.replace(' ', '').replace('-', '') - pinlst.append(address3) - - nomi = pgeocode.Nominatim('IN') - try: - a = nomi.query_postal_code(str(pinlst[-1])) - # print(a) - b = a.keys() - c = b.values.tolist() - d = a.tolist() - postal_code = "PinCode1" + "--" + d[0] - final.append(postal_code) - country_code = c[1] + "--" + str(d[1]) - final.append(country_code) - place_name = 'LandMark1' + "--" + str(d[2]) - final.append(place_name) - state_name = c[3] + "--" + str(d[3]) - final.append(state_name) - state_code = c[4] + "--" + str(d[4]) - final.append(state_code) - county_name = 'CityName1' + "--" + str(d[5]) - final.append(county_name) - - except (IndexError, NameError): - final.append("PinCode1--") - final.append("country_code--") - final.append("LandMark1--") - final.append("state_name--") - final.append("state_code--") - final.append("CityName1--") - - ######################################################## json ##################################################################### - - import pandas as pd - df = pd.DataFrame(final) - df1 = df[0].str.split('--', expand=True) - # print(df1) - df1.rename({df1.columns[-2]: 'Keys'}, axis=1, inplace=True) - df1.rename({df1.columns[-1]: 'Values'}, axis=1, inplace=True) - df1['Keys'] = df1['Keys'].str.strip() - df1.to_csv('path123.csv', index=False) - df2 = pd.read_csv('path123.csv') - print(df2) - df2 = df2.T - df2.to_csv('path1.csv', index=False, header=False) - df1 = pd.read_csv('path1.csv') - df1.to_json('firstjson1.json', orient="index") - import json - with open('firstjson1.json', 'r') as json_file: - json_load = json.load(json_file) - # # url = "https://test.bizgaze.app:8443/apis/v4/bizgaze/integrations/businesscards/create" - nothing = json.dumps(json_load).replace("]", "").replace("[", "").replace('{"0":', '').replace('}}', '}') - # # print('--------------------------------------------------------------------------') - # # print(nothing) - empty = [] - import base64 - name = found - image = open(name, 'rb') - image_read = image.read() - image_64_encode = base64.b64encode(image_read) - NULL = 'null' - empty.append("ByteData--" + (NULL).strip('""')) - image_64_encode = image_64_encode.decode('utf-8') - empty.append("FileData--" + str(image_64_encode)) - imagedata = name.split("/") - imagename = str(imagedata[-1]).replace('"', '').replace("[", "").replace("]", "") - imagename1 = str(imagename).split('.') - imagename = str(imagename1[-2]).replace("[", "]") - empty.append("FileName--" + imagename) - empty.append("FilePath--"+ "") - imageExtension = str(imagename1[-1]).replace("[", "]") - empty.append("FileType--" + imageExtension) - image.close() - import pandas as pd - df = pd.DataFrame(empty) - df = df[0].str.split("--", expand=True) - data1 = pd.DataFrame(df[0]) - data2 = pd.DataFrame(df[1]) - dt = data2.set_index(data1[0]) - dt4 = dt.T - dictionary = dt4.to_dict(orient="index") - list1 = [] - # list.append(a) - list1.append(dictionary[1]) - # # final.append("image--"+str(dictionary[1]).replace("\'",'"')) - print('--------------------') - # print(namelist) - import json - # JSON data: - x = nothing - # python object to be appended - y = {"image": dictionary[1]} - # parsing JSON string: - z = json.loads(x) - # appending the data - z.update(y) - # the result is a JSON string: - # print(json.dumps(z)) - zlist=[] - zlist.append(z) - #############################################creating csv##################################### - print(final) - print(imagelist) - final.append('image--' + str(imagelist)) - import requests - import json - url = "https://anwi.bizgaze.app/apis/v4/bizgaze/integrations/businesscards/create/list" #dev - # url = "https://qa.bizgaze.app/apis/v4/bizgaze/integrations/businesscards/create" #testing - # url = "https://test.bizgaze.app/apis/v4/bizgaze/integrations/businesscards/create" # test - # url='http://localhost:3088/apis/v4/bizgaze/integrations/businesscards/create' - # url = 'https://c01.bizgaze.app/apis/v4/bizgaze/integrations/businesscards/create' # C01 - payload1 = json.dumps(zlist) - # print('--------------------------------------------------------------------------') - #print(payload1) - headers = { - #'Authorization': 'stat 1a936137490040c997928f485e3cdd7a', #dev - 'Authorization': 'stat 16516391d0074f4c8a15ea16fb49470b',#testing - # 'Authorization': 'stat 08e55fcfbaa940c8ab8145a074c444d1', - # 'Authorization': 'stat f7cdb402e01e44e5842878653946168f', # c01 - # 'Authorization': 'Stat c3e11b2fcbfe455b86a1fe6efde02a69',#demo - 'Content-Type': 'application/json' - } - response = requests.request("POST", url, headers=headers, data=payload1) - # print("##############################################################") - - #print(payload1) - print(response.text) - import os - if 'BusinessCards Created Successfully' in response.text: - print('present') - os.remove(found) - else: - print('not present') - - df1.to_json('visitingcard.json') - data = df1.to_json('visiting.json', orient='records') - print(data) - - #return render_template('index.html') - - - return response.text - # return 'done' - - -if __name__ == "__main__": +from flask import Flask, render_template, request, redirect, Response, send_file +import os +import openai +import requests +import pandas as pd +import pgeocode +from email_scraper import scrape_emails +import phonenumbers +from pdfminer.high_level import extract_text +import pytesseract +import time +import multiprocessing +from PIL import Image +from functools import partial +from urlextract import URLExtract +import pytesseract as tess +from PIL import Image +import os +import glob + +from pytesseract import * +import shutil +import cv2 +import matplotlib +from werkzeug.utils import secure_filename +import requests +import spacy +import time +import multiprocessing +from PIL import Image +from functools import partial + +import pandas as pd +################################################################ +Current_Working_Directory=os.getcwd() +Current_Working_Directory=Current_Working_Directory.replace("\\","/") +nlp_model1 = spacy.load(Current_Working_Directory + "/Invoice_parser/p") + +################################################################ +# import spacy + +# nlp_model1 = spacy.load('./ADD3001.2') +from flair.data import Sentence +from flair.models import SequenceTagger +from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline + +tokenizer = AutoTokenizer.from_pretrained("ml6team/bert-base-uncased-city-country-ner") +model = AutoModelForTokenClassification.from_pretrained("ml6team/bert-base-uncased-city-country-ner") + +from paddleocr import PaddleOCR, draw_ocr + +ocr = PaddleOCR(use_angle_cls=True, lang='en', use_space_char=True, show_log=True) +tagger = SequenceTagger.load("flair/ner-english-large") + +import datetime + +app = Flask(__name__) + + +# app.config["IMAGE_UPLOADS"] = "C:/inetpub/wwwroot/FlaskApplication/Flask_Demo/upload/" + +@app.route('/', methods=['GET']) +def card(): + return render_template('card.html') + + + +@app.route('/upload_BusinessCards', methods=["POST"]) +# @app.route('/multiplecards', methods=["POST"]) +def multiplecards(): + # print('################## multiple card detection #######################') + # print(Dataset) + datalist=[] + Dataset = request.get_json() + # print(data) + #datalist.append(Dataset) + data = {'visiting': Dataset} + for i in data['visiting']: + import time + # time.sleep(1) + a = i + x = a['FileData'] + # print(x) + y = a['FileName'] + z = a['FileType'] + # CreatedBy=a['CreatedBy'] + + name = y + '.' + z + # print(name) + # print(y) + # image = y.split("/") + # filename=image[-1] + + # print(x) + img_data = x.encode() + + import base64 + with open('./multicards/' + name, "wb") as fh: + fh.write(base64.decodebytes(img_data)) + # print(i) + + # import os + # import glob + # for i in glob.glob('./multipleupload/*'): + + found = './multicards/' + name + print(found) + extension = found.split('.')[-1] + + # for root, dirs, fils in os.glob('./multipleupload'): + # for name in files: + # foundfile= os.path.join(root, name) + # print(foundfile) + + import re + import csv + import glob + import os + # import pytesseract + # import cv2 + import numpy as np + import glob + import os + import cv2 + import requests + final = [] + # final.append('assignto--'+CreatedBy) + imagelist = [] + # print(found) + remove_list = [] + import os + import glob + import pdfminer + + # import os + # ts = 0 + # for file_name in glob.glob('./upload/*'): + # fts = os.path.getmtime(file_name) + # if fts > ts: + # ts = fts + # found = file_name + # print(found) + + # print(extension) + + def org_name(): + print('org_name is working') + import pytesseract + fname = found + if extension != 'pdf': + + img = cv2.imread(fname) + img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) + + cv2.imwrite(str(found), img) + from PIL import Image + im = Image.open(found) + im.save("images1.png", dpi=(1200, 1200)) + # import pytesseract + fname = "images1.png" + import pytesseract as tess + from PIL import Image + + tess.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe" + pdf = tess.image_to_pdf_or_hocr(fname, extension="pdf") + with open("demo.pdf", "w+b", ) as f: + f.write(pdf) + + from pdfminer.high_level import extract_text + text = extract_text('demo.pdf') + # doc = DocumentFile.from_images(found) + # result = model(doc) + # text = result.render() + + # from pdfminer.high_level import extract_text + # txt = extract_text('demo.pdf') + else: + from pdfminer.high_level import extract_text + text = extract_text(fname) + + sentence = Sentence(text) + + # predict NER tags + tagger.predict(sentence) + + # print sentence + ko = (sentence) + + ko1 = str(ko).split("→") + import pandas as pd + + dfg = [] + try: + s = ko1[1].replace("", "").replace("", "").replace("/", ":") + + # os.remove(found) + # return 'Invalid image' + dfg.append(s) + df = pd.DataFrame(dfg) + df = df[0] + + df.to_csv("df.csv", index=False) + + df1 = pd.read_csv("df.csv") + ve = df1["0"].str.split(",") + fgf = ve.to_list() + dfgh = pd.DataFrame(fgf[0]) + maindf = dfgh[0] # .str.split(":") + # maindf.to_csv("main.csv") + + main1 = maindf.to_list() + main1 + # cv=pd.DataFrame(ve) + # cv + per = ["PER"] + org = ["ORG"] + loc = ["LOC"] + organizations = [i for i in main1 for j in org if j in i] + PErsons = [i for i in main1 for j in per if j in i] + location = [i for i in main1 for j in loc if j in i] + except IndexError: + pass + + # ************************************* ORGANIZATION ******************************************************************** + + def organisation(): + print('organisation working ') + try: + if len(("OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace('[', + '').replace( + '.com', '').replace('.in', '') + " /" + organizations[1].replace(":ORG", "").replace('"', + '').replace( + '.com', ''))) < 4: + pass + + + else: + + match = str(urlfinal[0]).lower() + match = match.replace('.com', '').replace('www.', '').replace('.in', '').replace('.co', '').replace( + 'https', + '').replace( + 'http', '').replace(":", "").replace("/", "").upper() + print(match) + + s1g = organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace('.com', + '') + " /" + \ + organizations[1].replace(":ORG", "").replace('"', '').replace('.com', '') + s1 = s1g.upper() + s2 = match.upper() + from difflib import SequenceMatcher + print(s1) + print(s2) + print(SequenceMatcher(None, s1, s2).ratio()) + if SequenceMatcher(None, s1, s2).ratio() >= 0.10: + # and SequenceMatcher(None, s1, s2).ratio()<0.50: + final.append( + "OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace('[', + '').replace( + '.com', '').replace(']', '') + " /" + organizations[1].replace(":ORG", "").replace('"', + '').replace( + '.com', + '').replace(']', '')) + else: + final.append("OrganizationName--" + s2) + + except IndexError: + try: + if len(("OrganizationName--" + organizations[0].replace(":ORG", "").replace('[', '').replace(']', + '').replace( + '"', + '').replace( + '.com', '').replace('.in', ''))) < 4: + pass + + else: + match = str(urlfinal[0]).lower() + match = match.replace('.com', '').replace('www.', '').replace('.in', '').replace('.co', + '').replace( + 'https', '').replace('http', '').replace(":", "").replace("/", "").upper() + + s1g = organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace('.com', '') + s1 = s1g.upper() + s2 = match.upper() + from difflib import SequenceMatcher + print(s1) + print(s2) + print(SequenceMatcher(None, s1, s2).ratio()) + if SequenceMatcher(None, s1, s2).ratio() >= 0.10: + # and SequenceMatcher(None, s1, s2).ratio()<0.50: + final.append( + "OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace( + '[', + '').replace( + ']', '').replace( + '.com', '')) + else: + final.append("OrganizationName--" + s2) + + except IndexError: + try: + match = str(urlfinal[0]).lower() + match = match.replace('.com', '').replace('www.', '').replace('.in', '').replace('.co', + '').upper() + final.append("OrganizationName--" + match) + # remove_list.append(match) + except IndexError: + company() + + #################################################company Name######################################## + + def company(): + print('company list working') + import re + + new = [] + with open('test.txt', 'r+') as f: + flag = False + for line in f: + line = line.upper() + matches = re.findall( + r'''\bENTERPRISE\b|\bTRADE\b|\bEMPIRE\b|\bSTORES\b|\bMACHINERY\b|\bINDUSTRIES\b|\bTECHNOLOGY\b|\bCOMPANY\b|\bDESIGNERS\b|\bPOLYMER\b|\bBELT\b|\bAGRO\b|\bPLASTIC\b|\bGROUP\b|\bTOOLS\b|\bENGG.\b|\bSOLUTION\b|\bCONSTRUCTION\b|\bPACK\b|\bELECT\b|\bSTEEL\b|\bIRON\b|\bDIES\b|\bMOULD\b|\bCORPORATION\b|\bSEEDS\b|\bPOWER\b|\bCONSULTANT\b|\bMFG.\b|\bPRINT\b|\bFOOD\b|\bSOLAR\b|\bINDUSTRY\b|\bLIMITED\b|\bPRIVATE\b|\bPVT\b|\bLTD\b|\bOUTSOURCING\b|\bCNC\b|\bMACHINERIES\b|\bSOLUTIONS\b|\bENGINEERS\b|\bWORKS\b|\bPRODUCTS\b|\bENTERPRISES\b|\bCOMPANIES\b|\bPOLYMERS\b|\bTRADING\b''', + line) + + for i in matches: + if i in line: + flag = True + if flag: + o = "OrganizationName--" + line + new.append(o) + # if line.startswith('\n'): + # flag = False + try: + a = new[0].replace('\n', '') + final.append(a) + except IndexError: + final.append("OrganizationName--") + + # ************************************* CONTACT PERSON ******************************************************************* + def contactpersonname(): + print('contactpersonname working') + try: + final.append( + "CONTACTPERSONNAME--" + PErsons[0].replace(":PER", "").replace("[", "").replace('"', '').replace( + "]", + "") + '/' + + PErsons[ + 1].replace(":PER", "").replace('"', '')) + except IndexError: + try: + final.append( + "CONTACTPERSONNAME--" + PErsons[0].replace(":PER", "").replace("[", "").replace("]", + "").replace( + '"', '')) + except IndexError: + final.append("CONTACTPERSONNAME--") + + def image_to_text(): + + # doc = DocumentFile.from_images(found) + # result = model(doc) + # image_to_text.txt = result.render() + + # tess.pytesseract.tesseract_cmd = r"C:\\Program Files (x86)\\Tesseract-OCR\\tesseract.exe" + # img = Image.open(found) + # text = tess.image_to_string(img) + # image_to_text.txt = text + # print(text) + import cv2 + img_path = found + img = cv2.imread(img_path) + img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) + cv2.imwrite(str(found), img) + + result = ocr.ocr(img_path, cls=True) + result = result[0] + + txts = [line[1][0] for line in result] + + image_to_text.txt = "" + for i in txts: + if len(i) < 4: + continue + # print(i+"\n") + image_to_text.txt = image_to_text.txt + str(i) + "\n" + # print(image_to_text.txt) + + def pdf_to_text(): + + from pdfminer.high_level import extract_text + pdf_to_text.txt = extract_text(found) + # pdf_to_text.txt= text.replace('\n', ' ') + + extensionlist = ['JPEG', 'jpg', 'png', 'JPG', 'PNG', 'jpeg'] + + if extension in extensionlist: + print('image' + extension) + image_to_text() + x = image_to_text.txt + + else: + print('pdf' + extension) + pdf_to_text() + x = pdf_to_text.txt + + verticaltext = x + htext = x + # print('------------------------------------------------') + print( + '############################################################# this is verticaltext #################################################################') + print(verticaltext) + htext = htext.replace('\n', ' ') + print( + '############################################################# this is htext #############################################################') + print(htext) + y = x.replace('\n', ',') + y = y.replace(' ', ' ') + # y = y.replace(".", " .") + horizontaltext = y + # print('------------------------------------------------') + print( + '############################################################# this is horizontaltext #############################################################') + print(horizontaltext) + + textfile = open("test123456.txt", "w") + a = textfile.write(verticaltext) + textfile.close() + textfile = open("vtext.txt", "w") + a = textfile.write(horizontaltext) + textfile.close() + with open('test123456.txt', 'r') as f: + with open('test.txt', 'w') as w: + for line in f: + if line.strip().replace('|', ''): + w.write(line) + + ###########################ADDRESS################################## + addrespinlst = [] + + def splitaddress(): + import re + textaddress = htext.replace('\n', ' ') + # print(textaddress) + + address1 = (textaddress.partition(",")[0]) + words = address1.split() + address1 = words[-1] + addre = (htext.partition(",")[2]) + a = addre.replace('\n', ' ').replace('\x0c', '') + addre = (a.partition(",")[2]) + matches = re.findall( + r'(.*?)-\d{3} \d{3}|(.*?)\b-\d{6}\b|(.*?)\b\d{6}\b|(.*?)\b\d{3} \d{3}\b|\b(.*?)-\d{2}\b|(.*?)\b\d{3} \d{3}\b', + a) + for match in matches: + address2 = match + address2 = str(address2) + address2 = address2.replace("'", "").replace("(", "").replace(")", "").replace(', ,', '').replace(' ', + '') + + matches = re.findall(r'-\d{6}|\b\d{6}\b|\b\d{3} \d{3}\b|\b-\d{2}\b|-\d{3} \d{3}|\b\d{3} \d{3}\b', a) + for address3 in matches: + pass + try: + Address = address1 + "," + address2 + "," + address3 + final.append('ADDRESS--' + Address) + addrespinlst.append(Address) + + except NameError: + + print( + '############################################################ Addressmodelworking #############################################################') + + # doc = nlp_model1(textaddress) + # addlist = [] + # for ent in doc.ents: + # name = (f'{ent.label_.upper():{10}}--{ent.text}') + # addlist.append(name) + # try: + # Address = addlist[0] + # final.append(Address) + # addrespinlst.append(Address) + # remove_list.append( + # str(Address).replace("[", "").replace("]", "").replace("\\n", "").replace("'", "").replace( + # "ADDRESS--", + # "")) + # except IndexError: + # final.append("ADDRESS--") + pass + + ################################################## website####################################################### + + # import re + + # url = [] + # matches = re.findall(r'www.*', verticaltext) + # for match in matches: + # if (match.count('.')) == 1: + # a_string1 = match.replace("www", "www.") + + # final.append("Urls--" + a_string1) + # url.append(a_string1) + # else: + + # final.append("Urls--" + match) + + # if len(url)==0: + + # from urlextract import URLExtract + + # extractor = URLExtract() + # urls = extractor.find_urls(verticaltext) + # try: + # urllist = urls[0] + # final.append("Urls--"+urllist) + # url.append(urllist) + # except IndexError: + # final.append("Urls--") + + # for match in matches: + # if (match.count('.')) == 1: + # a_string1 = match.replace("www", "www.") + + # final.append("Urls--" + a_string1) + # url.append(a_string1) + # else: + + # final.append("Urls--" + match) + # url.append(match) + # remove_list.append(match) + # else: + # final.append("Urls--" ) + + ################################################## website####################################################### + + import re + # final=[] + url = [] + urlfinal = [] + matches = re.findall(r'www.*', verticaltext) + for match in matches: + + if (match.count('.')) == 1: + a_string1 = match.replace("www", "www.") + + # final.append("Urls--" + a_string1) + url.append(a_string1) + else: + + url.append(match) + + if len(url) == 0: + + from urlextract import URLExtract + + extractor = URLExtract() + urls = extractor.find_urls(verticaltext) + try: + urllist = urls[0] + url.append(urllist) + url.append(urllist) + except IndexError: + pass + + for match in matches: + if (match.count('.')) == 1: + a_string1 = match.replace("www", "www.") + + url.append(a_string1) + # url.append(a_string1) + else: + + url.append(match) + url.append(match) + + else: + pass + try: + test_string = url[0] + + test_list = ['com', 'www', 'in', 'co', "WWW", "COM", "CO", "IN"] + + res = [ele for ele in test_list if (ele in test_string)] + + if len(res) == 0: + print('no match') + + final.append('urls--') + + + else: + print('matched') + final.append('urls--' + url[0]) + urlfinal.append(url[0]) + + + except IndexError: + final.append('urls--') + + print( + '############################################################# url #############################################################') + print(url) + #######organisation and contact################ + + # def company_url(): + # # print('--url--') + # # print(url) + + # try: + # match = str(url[0]).lower() + # match =match.replace('.com','').replace('www.','').replace('.in','').replace('.co','').upper() + # final.append("OrganizationName--" + match) + # # remove_list.append(match) + # except IndexError: + # org_name() + # organisation() + # final.append("OrganizationName--") + + # make example sentence + + # print(horizontaltext) + sentence = Sentence(verticaltext) + + # predict NER tags + tagger.predict(sentence) + + # print sentence + ko = (sentence) + + ko1 = str(ko).split("→") + import pandas as pd + + dfg = [] + try: + s = ko1[1].replace("", "").replace("", "").replace("/", ":") + except IndexError: + os.remove(found) + return 'Invalid image' + dfg.append(s) + df = pd.DataFrame(dfg) + df = df[0] + + df.to_csv("df.csv", index=False) + + df1 = pd.read_csv("df.csv") + ve = df1["0"].str.split(",") + fgf = ve.to_list() + dfgh = pd.DataFrame(fgf[0]) + maindf = dfgh[0] # .str.split(":") + # maindf.to_csv("main.csv") + + main1 = maindf.to_list() + main1 + # cv=pd.DataFrame(ve) + # cv + per = ["PER"] + org = ["ORG"] + loc = ["LOC"] + organizations = [i for i in main1 for j in org if j in i] + PErsons = [i for i in main1 for j in per if j in i] + location = [i for i in main1 for j in loc if j in i] + + # ************************************* ORGANIZATION ******************************************************************** + try: + if len(("OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace('[', + '').replace( + ']', '').replace( + '.com', '') + " /" + organizations[1].replace(":ORG", "").replace('"', '').replace('.com', ''))) < 4: + pass + # company_url() + else: + + match = str(urlfinal[0]).lower() + match = match.replace('.com', '').replace('www.', '').replace('.in', '').replace('.co', '').replace( + 'https', + '').replace( + 'http', '').replace(":", "").replace("/", "").upper() + print(match) + + s1g = organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace(']', '').replace( + '.com', '') + " /" + \ + organizations[1].replace(":ORG", "").replace('"', '').replace('.com', '') + s1 = s1g.upper() + s2 = match.upper() + from difflib import SequenceMatcher + print(s1) + print(s2) + print(SequenceMatcher(None, s1, s2).ratio()) + if SequenceMatcher(None, s1, s2).ratio() >= 0.10: + # and SequenceMatcher(None, s1, s2).ratio()<0.50: + final.append( + "OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace('[', + '').replace( + '.com', '').replace(']', '') + " /" + organizations[1].replace(":ORG", "").replace('"', + '').replace( + '.com', '').replace(']', '')) + else: + final.append("OrganizationName--" + s2) + + + + except IndexError: + try: + if len(("OrganizationName--" + organizations[0].replace(":ORG", "").replace('[', '').replace(']', + '').replace( + '"', + '').replace( + '.com', ''))) < 4: + pass + # company_url() + else: + + match = str(urlfinal[0]).lower() + match = match.replace('.com', '').replace('www.', '').replace('.in', '').replace('.co', '').replace( + 'https', '').replace('http', '').replace(":", "").replace("/", "").upper() + + s1g = organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace(']', + '').replace( + '.com', '') + s1 = s1g.upper() + s2 = match.upper() + from difflib import SequenceMatcher + print(s1) + print(s2) + print(SequenceMatcher(None, s1, s2).ratio()) + if SequenceMatcher(None, s1, s2).ratio() >= 0.10: + # and SequenceMatcher(None, s1, s2).ratio()<0.50: + final.append( + "OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace('[', + '').replace( + ']', '').replace( + '.com', '').replace(']', '')) + else: + final.append("OrganizationName--" + s2) + + except IndexError: + org_name() + organisation() + + # final.append("OrganizationName--") + + # ************************************* CONTACT PERSON ******************************************************************* + try: + final.append( + "CONTACTPERSONNAME--" + PErsons[0].replace(":PER", "").replace("[", "").replace('"', '').replace("]", + "") + + PErsons[ + 1].replace(":PER", "").replace('"', '')) + except IndexError: + try: + final.append( + "CONTACTPERSONNAME--" + PErsons[0].replace(":PER", "").replace("[", "").replace("]", "").replace( + '"', + '')) + except IndexError: + org_name() + contactpersonname() + # final.append("CONTACTPERSONNAME--") + ###############address flair##################### + + try: + print( + '############################################################# address new code #############################################################') + loactionlst = ['address', 'factory', 'd.no', 'h.no', 'h. no', 'plot', 'flat', 'plat'] + loclst = [i for i in loactionlst if i in htext.lower()] + + textaddress = htext + textaddress = textaddress.replace("|", ",") + textaddress = textaddress.lower() + + nlp = pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy="simple") + grop = nlp(textaddress) + + citycountry = [] + print('########################### city or country name ###########################') + d = grop[-1] + + if d['entity_group'] == "COUNTRY": + print(d["word"]) + citycountry.append(d["word"]) + elif d['entity_group'] == "CITY": + print(d["word"]) + citycountry.append(d["word"]) + + try: + address1 = loclst[0] + except IndexError: + address1 = (textaddress.partition(",")[0]) + words = address1.split() + address1 = words[-1] + + star_location = address1.lower() + end_location = citycountry[0].replace("#", "") + start = star_location + end = end_location + s = textaddress.lower() + middle_address = (s.split(start))[-1].split(end)[0] + Address = start + middle_address + end + Address = Address.replace('--', '').title() + print(Address) + if Address.count(',') < 2: + splitaddress() + else: + final.append('ADDRESS--' + Address) + + # star_location = location[0].replace(":LOC", "").replace('"', '').replace('[', '') + # end_location = location[-1].replace(":LOC", "").replace('"', '').replace(']', '') + # d1 = star_location.split() + # d2 = end_location.split() + # d3 = d1[0] + # d4 = d2[0] + # start = d3 + # end = d4 + # s = horizontaltext + # middle_address = ((s.split(start))[1].split(end)[0]) + # Address = d3 + middle_address + d4 + # final.append('ADDRESS--' + Address) + # addrespinlst.append(Address) + + + except IndexError: + splitaddress() + + ########################################## Designation ########################################### + import re + new = [] + with open('test.txt', 'r') as f: + flag = False + for line in f: + line1 = line + line = line.upper() + matches = re.findall( + r'''\bAPPRENTICE\b|\bEXECUTIVE\b|\bPROPRIETOR\b|\bPARTNER\b|\bMD\b|\bANALYST\b|\bPRACTITIONER\b|\bCUSTOMER\b|\bCOO\b|\bCOACH\b|\bADMINISTRATIVE\b|\bADMINISTRATOR\b|\bAGENT\b|\bHEAD\b|\bCHIEF\b|\bDIRECTOR\b|\bVICE\b|\bPRESIDENT\b|\bMANAGER\b|\bCOORDINATOR\b|\bCOUNSELOR\b|\bSUPERVISOR\b|\bASSISTANT\b|\bSPECIALIST\b|\bARTIST\b|\bWORKER\b|\bCONSULTANT\b|\bREPRESENTATIVE\b|\bARCHITECT\b|\bSTAFF\b|\bMEMBER\b|\bDEVELOPER\b|\bENGINEER\b|\bEXAMINOR\b|\bDOCTOR\b|\bPROFESSOR\b|\bTEACHER\b|\bLEAD\b|\bOFFICER\b|\bCEO\b|\bC.E.O\b|\bJUNIOR\b|\bSENIOR\b|\bPROFESSOR\b|\bSALES\b''', + line) + for match in matches: + line = line.replace('-', '') + # print(line) + o = "Designation--" + line + new.append(o) + remove_list.append(str(line1).replace('\n', '')) + + try: + a = new[0].replace('\n', '') + final.append(a) + + except IndexError: + final.append("Designation--") + + ###################################################Phone number################################################# + num = [] + import phonenumbers + + # print(verticaltext) + numbers = phonenumbers.PhoneNumberMatcher( + verticaltext.replace('+91', '').replace('(0)', '').replace('(', '').replace(')', ''), "IN") + + for number in numbers: + number = str(number).split(")") + num.append(number[1]) + # num.append(number[-1]) + if len(num) == 0: + final.append("ContactNumber--") + final.append("OrganizationNumber--") + elif len(num) > 1: + final.append("ContactNumber--" + num[0].replace(' ', '')) + final.append("OrganizationNumber--" + num[-1].replace(' ', '')) + elif len(num) == 1: + try: + final.append("ContactNumber--" + num[0].replace(' ', '')) + final.append("OrganizationNumber--") + except IndexError: + final.append("ContactNumber--") + final.append("OrganizationNumber--") + print( + '############################################################# num #############################################################') + print(num) + # try: + # final.append("PhoneNumber--" + num[0].replace(' ', '')) + # remove_list.append(num[0]) + # except IndexError: + # pass + # try: + # final.append("PhoneNumber1--" + num[1].replace(' ', '')) + # remove_list.append(num[1]) + # except IndexError: + # pass + # try: + # final.append("PhoneNumber2--" + num[2].replace(' ', '')) + # remove_list.append(num[2]) + # except IndexError: + # pass + + ################################################### Email###################################################### + import re + from email_scraper import scrape_emails + s = list(scrape_emails(horizontaltext)) + email_id = s + + # email_id = [] + # matches = re.findall(r'[\w\.-]+@[\w\.-]+', verticaltext) + # for match in matches: + # email_id.append(match) + + # # final.append('Email--' + match) + # email_ = str(email_id).replace("[", "").replace("]", "").replace("'", "") + # # final.append(email_) + + # # final.append('Email--' + email_) + # # remove_list.append(email_) + if len(email_id) > 1: + final.append( + 'ContactEmail--' + str(email_id[0]).replace("[", "").replace("]", "").replace("\\n", "").replace("'", + "")) + final.append( + 'OrganizationEmail--' + str(email_id[-1]).replace("[", "").replace("]", "").replace("\\n", "").replace( + "'", + "")) + else: + try: + final.append( + 'ContactEmail--' + str(email_id[0]).replace("[", "").replace("]", "").replace("\\n", "").replace( + "'", + "")) + final.append('OrganizationEmail--') + except IndexError: + final.append('ContactEmail--') + final.append('OrganizationEmail--') + + ###############PINCODE############ + + pinlst = [] + print(addrespinlst) + import pgeocode + + # try: + # matche1 = re.findall(r'-\d{6}|\b\d{6}\b|\b\d{3} \d{3}\b|-\d{3} \d{3}|\b\d{3} \d{3}\b', addrespinlst[0]) + # for i in matche1: + # address3 = i.replace(' ', '').replace('-', '') + # pinlst.append(address3) + # except IndexError: + + lst = [] + for i in num: + i = i[1:] + lst.append(i) + + infile = r"vtext.txt" + outfile = r"cleaned_file.txt" + import glob + delete_list = lst + # delete_list = ["firstname1 lastname1","firstname2 lastname2","firstnamen lastnamen",'Director - Sales & Business Development'] + fin = open(infile, "r+") + fout = open(outfile, "w+") + for line12 in fin: + for word in delete_list: + line12 = line12.replace(word, "") + + fout.write(line12) + fin.close() + # print(line) + + # print(addrespinlst) + import pgeocode + print(line12) + import re + matche1 = re.findall(r'-\d{6}|\b\d{6}\b|\b\d{3} \d{3}\b|-\d{3} \d{3}|\b\d{3} \d{3}\b', line12) + for i in matche1: + address3 = i.replace(' ', '').replace('-', '') + pinlst.append(address3) + + nomi = pgeocode.Nominatim('IN') + try: + a = nomi.query_postal_code(str(pinlst[-1])) + # print(a) + b = a.keys() + c = b.values.tolist() + d = a.tolist() + postal_code = "PinCode1" + "--" + d[0] + final.append(postal_code) + country_code = c[1] + "--" + str(d[1]) + final.append(country_code) + place_name = 'LandMark1' + "--" + str(d[2]) + final.append(place_name) + state_name = c[3] + "--" + str(d[3]) + final.append(state_name) + state_code = c[4] + "--" + str(d[4]) + final.append(state_code) + county_name = 'CityName1' + "--" + str(d[5]) + final.append(county_name) + + except (IndexError, NameError): + final.append("PinCode1--") + final.append("country_code--") + final.append("LandMark1--") + final.append("state_name--") + final.append("state_code--") + final.append("CityName1--") + + ######################################################## json ##################################################################### + + import pandas as pd + df = pd.DataFrame(final) + df1 = df[0].str.split('--', expand=True) + # print(df1) + df1.rename({df1.columns[-2]: 'Keys'}, axis=1, inplace=True) + df1.rename({df1.columns[-1]: 'Values'}, axis=1, inplace=True) + df1['Keys'] = df1['Keys'].str.strip() + df1.to_csv('path123.csv', index=False) + df2 = pd.read_csv('path123.csv') + print(df2) + df2 = df2.T + df2.to_csv('path1.csv', index=False, header=False) + df1 = pd.read_csv('path1.csv') + df1.to_json('firstjson1.json', orient="index") + import json + with open('firstjson1.json', 'r') as json_file: + json_load = json.load(json_file) + # # url = "https://test.bizgaze.app:8443/apis/v4/bizgaze/integrations/businesscards/create" + nothing = json.dumps(json_load).replace("]", "").replace("[", "").replace('{"0":', '').replace('}}', '}') + # # print('--------------------------------------------------------------------------') + # # print(nothing) + empty = [] + import base64 + name = found + image = open(name, 'rb') + image_read = image.read() + image_64_encode = base64.b64encode(image_read) + NULL = 'null' + empty.append("ByteData--" + (NULL).strip('""')) + image_64_encode = image_64_encode.decode('utf-8') + empty.append("FileData--" + str(image_64_encode)) + imagedata = name.split("/") + imagename = str(imagedata[-1]).replace('"', '').replace("[", "").replace("]", "") + imagename1 = str(imagename).split('.') + imagename = str(imagename1[-2]).replace("[", "]") + empty.append("FileName--" + imagename) + empty.append("FilePath--"+ "") + imageExtension = str(imagename1[-1]).replace("[", "]") + empty.append("FileType--" + imageExtension) + image.close() + import pandas as pd + df = pd.DataFrame(empty) + df = df[0].str.split("--", expand=True) + data1 = pd.DataFrame(df[0]) + data2 = pd.DataFrame(df[1]) + dt = data2.set_index(data1[0]) + dt4 = dt.T + dictionary = dt4.to_dict(orient="index") + list1 = [] + # list.append(a) + list1.append(dictionary[1]) + # # final.append("image--"+str(dictionary[1]).replace("\'",'"')) + print('--------------------') + # print(namelist) + import json + # JSON data: + x = nothing + # python object to be appended + y = {"image": dictionary[1]} + # parsing JSON string: + z = json.loads(x) + # appending the data + z.update(y) + # the result is a JSON string: + # print(json.dumps(z)) + zlist=[] + zlist.append(z) + #############################################creating csv##################################### + print(final) + print(imagelist) + final.append('image--' + str(imagelist)) + import requests + import json + url = "https://anwi.bizgaze.app/apis/v4/bizgaze/integrations/businesscards/create/list" #dev + # url = "https://qa.bizgaze.app/apis/v4/bizgaze/integrations/businesscards/create" #testing + # url = "https://test.bizgaze.app/apis/v4/bizgaze/integrations/businesscards/create" # test + # url='http://localhost:3088/apis/v4/bizgaze/integrations/businesscards/create' + # url = 'https://c01.bizgaze.app/apis/v4/bizgaze/integrations/businesscards/create' # C01 + payload1 = json.dumps(zlist) + # print('--------------------------------------------------------------------------') + #print(payload1) + headers = { + #'Authorization': 'stat 1a936137490040c997928f485e3cdd7a', #dev + 'Authorization': 'stat 16516391d0074f4c8a15ea16fb49470b',#testing + # 'Authorization': 'stat 08e55fcfbaa940c8ab8145a074c444d1', + # 'Authorization': 'stat f7cdb402e01e44e5842878653946168f', # c01 + # 'Authorization': 'Stat c3e11b2fcbfe455b86a1fe6efde02a69',#demo + 'Content-Type': 'application/json' + } + response = requests.request("POST", url, headers=headers, data=payload1) + # print("##############################################################") + + #print(payload1) + print(response.text) + import os + if 'BusinessCards Created Successfully' in response.text: + print('present') + os.remove(found) + else: + print('not present') + + df1.to_json('visitingcard.json') + data = df1.to_json('visiting.json', orient='records') + print(data) + + #return render_template('index.html') + + + return response.text + # return 'done' + + +if __name__ == "__main__": app.run(host='0.0.0.0', port=1112) \ No newline at end of file