12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119 |
- from flask import Flask, render_template, request, redirect, Response, send_file
- import os
- # import openai
- import requests
- import pandas as pd
- import pgeocode
- from email_scraper import scrape_emails
- import phonenumbers
- from pdfminer.high_level import extract_text
- import pytesseract
- import time
- import multiprocessing
- from PIL import Image
- from functools import partial
- from urlextract import URLExtract
- import pytesseract as tess
- from PIL import Image
- import os
- import glob
-
- from pytesseract import *
- import shutil
- import cv2
- import matplotlib
- from werkzeug.utils import secure_filename
- import requests
- #import spacy
- import time
- import multiprocessing
- from PIL import Image
- from functools import partial
-
- import pandas as pd
################################################################
# Resolve the process working directory once at import time; backslashes are
# normalized to forward slashes so paths built from it behave the same on
# Windows and POSIX.
Current_Working_Directory=os.getcwd()
Current_Working_Directory=Current_Working_Directory.replace("\\","/")
# nlp_model1 = spacy.load(Current_Working_Directory + "/Invoice_parser/p")

################################################################
# import spacy

# nlp_model1 = spacy.load('./ADD3001.2')
from flair.data import Sentence
from flair.models import SequenceTagger
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# Token-classification model used later (see the address-parsing section of the
# upload route) to tag CITY/COUNTRY entities in OCR'd card text.
# NOTE(review): downloaded from the HuggingFace hub at import time — first
# startup requires network access.
tokenizer = AutoTokenizer.from_pretrained("ml6team/bert-base-uncased-city-country-ner")
model = AutoModelForTokenClassification.from_pretrained("ml6team/bert-base-uncased-city-country-ner")

from paddleocr import PaddleOCR, draw_ocr

# OCR engine for image uploads: angle classification handles rotated text,
# use_space_char preserves spacing between recognized words.
ocr = PaddleOCR(use_angle_cls=True, lang='en', use_space_char=True, show_log=True)
# Flair large English NER tagger; the upload route uses its PER/ORG/LOC tags to
# split card text into person, organization and location candidates.
tagger = SequenceTagger.load("flair/ner-english-large")

import datetime

# Flask application object; routes below are registered against it.
app = Flask(__name__)


# app.config["IMAGE_UPLOADS"] = "C:/inetpub/wwwroot/FlaskApplication/Flask_Demo/upload/"
-
@app.route('/', methods=['GET'])
def card():
    """Serve the landing page with the business-card upload form."""
    template_name = 'card.html'
    return render_template(template_name)
-
-
- @app.route('/upload_BusinessCards', methods=["POST"])
- # @app.route('/multiplecards', methods=["POST"])
- def multiplecards():
- # print('################## multiple card detection #######################')
- # print(Dataset)
- from pathlib import Path
- Path("multicards").mkdir(exist_ok=True)
- datalist=[]
- zlist=[]
- Dataset = request.get_json()
- # print(data)
- #datalist.append(Dataset)
- data = {'visiting': Dataset}
- for i in data['visiting']:
- import time
- # time.sleep(1)
- a = i
- x = a['FileData']
- # print(x)
- y = a['FileName']
- z = a['FileType']
- # CreatedBy=a['CreatedBy']
-
- name = y + '.' + z
- # print(name)
- # print(y)
- # image = y.split("/")
- # filename=image[-1]
-
- # print(x)
- img_data = x.encode()
-
- import base64
- with open('./multicards/' + name, "wb") as fh:
- fh.write(base64.decodebytes(img_data))
- # print(i)
-
- # import os
- # import glob
- # for i in glob.glob('./multipleupload/*'):
-
- found = './multicards/' + name
- print(found)
- extension = found.split('.')[-1]
-
- # for root, dirs, fils in os.glob('./multipleupload'):
- # for name in files:
- # foundfile= os.path.join(root, name)
- # print(foundfile)
-
- import re
- import csv
- import glob
- import os
- # import pytesseract
- # import cv2
- import numpy as np
- import glob
- import os
- import cv2
- import requests
- final = []
- # final.append('assignto--'+CreatedBy)
- imagelist = []
- # print(found)
- remove_list = []
- import os
- import glob
- import pdfminer
-
- # import os
- # ts = 0
- # for file_name in glob.glob('./upload/*'):
- # fts = os.path.getmtime(file_name)
- # if fts > ts:
- # ts = fts
- # found = file_name
- # print(found)
-
- # print(extension)
-
- def org_name():
- print('org_name is working')
- import pytesseract
- fname = found
- if extension != 'pdf':
-
- img = cv2.imread(fname)
- img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
-
- cv2.imwrite(str(found), img)
- from PIL import Image
- im = Image.open(found)
- im.save("images1.png", dpi=(1200, 1200))
- # import pytesseract
- fname = "images1.png"
- import pytesseract as tess
- from PIL import Image
-
- tess.pytesseract.tesseract_cmd = r"C:\\Program Files (x86)\\Tesseract-OCR\\tesseract.exe"
- pdf = tess.image_to_pdf_or_hocr(fname, extension="pdf")
- with open("demo.pdf", "w+b", ) as f:
- f.write(pdf)
-
- from pdfminer.high_level import extract_text
- text = extract_text('demo.pdf')
- # doc = DocumentFile.from_images(found)
- # result = model(doc)
- # text = result.render()
-
- # from pdfminer.high_level import extract_text
- # txt = extract_text('demo.pdf')
- else:
- from pdfminer.high_level import extract_text
- text = extract_text(fname)
-
- sentence = Sentence(text)
-
- # predict NER tags
- tagger.predict(sentence)
-
- # print sentence
- ko = (sentence)
-
- ko1 = str(ko).split("→")
- import pandas as pd
-
- dfg = []
- try:
- s = ko1[1].replace("", "").replace("", "").replace("/", ":")
-
- # os.remove(found)
- # return 'Invalid image'
- dfg.append(s)
- df = pd.DataFrame(dfg)
- df = df[0]
-
- df.to_csv("df.csv", index=False)
-
- df1 = pd.read_csv("df.csv")
- ve = df1["0"].str.split(",")
- fgf = ve.to_list()
- dfgh = pd.DataFrame(fgf[0])
- maindf = dfgh[0] # .str.split(":")
- # maindf.to_csv("main.csv")
-
- main1 = maindf.to_list()
- main1
- # cv=pd.DataFrame(ve)
- # cv
- per = ["PER"]
- org = ["ORG"]
- loc = ["LOC"]
- organizations = [i for i in main1 for j in org if j in i]
- PErsons = [i for i in main1 for j in per if j in i]
- location = [i for i in main1 for j in loc if j in i]
- except IndexError:
- pass
-
- # ************************************* ORGANIZATION ********************************************************************
-
- def organisation():
- print('organisation working ')
- try:
- if len(("OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace('[',
- '').replace(
- '.com', '').replace('.in', '') + " /" + organizations[1].replace(":ORG", "").replace('"',
- '').replace(
- '.com', ''))) < 4:
- pass
-
-
- else:
-
- match = str(urlfinal[0]).lower()
- match = match.replace('.com', '').replace('www.', '').replace('.in', '').replace('.co', '').replace(
- 'https',
- '').replace(
- 'http', '').replace(":", "").replace("/", "").upper()
- print(match)
-
- s1g = organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace('.com',
- '') + " /" + \
- organizations[1].replace(":ORG", "").replace('"', '').replace('.com', '')
- s1 = s1g.upper()
- s2 = match.upper()
- from difflib import SequenceMatcher
- print(s1)
- print(s2)
- print(SequenceMatcher(None, s1, s2).ratio())
- if SequenceMatcher(None, s1, s2).ratio() >= 0.10:
- # and SequenceMatcher(None, s1, s2).ratio()<0.50:
- final.append(
- "OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace('[',
- '').replace(
- '.com', '').replace(']', '') + " /" + organizations[1].replace(":ORG", "").replace('"',
- '').replace(
- '.com',
- '').replace(']', ''))
- else:
- final.append("OrganizationName--" + s2)
-
- except IndexError:
- try:
- if len(("OrganizationName--" + organizations[0].replace(":ORG", "").replace('[', '').replace(']',
- '').replace(
- '"',
- '').replace(
- '.com', '').replace('.in', ''))) < 4:
- pass
-
- else:
- match = str(urlfinal[0]).lower()
- match = match.replace('.com', '').replace('www.', '').replace('.in', '').replace('.co',
- '').replace(
- 'https', '').replace('http', '').replace(":", "").replace("/", "").upper()
-
- s1g = organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace('.com', '')
- s1 = s1g.upper()
- s2 = match.upper()
- from difflib import SequenceMatcher
- print(s1)
- print(s2)
- print(SequenceMatcher(None, s1, s2).ratio())
- if SequenceMatcher(None, s1, s2).ratio() >= 0.10:
- # and SequenceMatcher(None, s1, s2).ratio()<0.50:
- final.append(
- "OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace(
- '[',
- '').replace(
- ']', '').replace(
- '.com', ''))
- else:
- final.append("OrganizationName--" + s2)
-
- except IndexError:
- try:
- match = str(urlfinal[0]).lower()
- match = match.replace('.com', '').replace('www.', '').replace('.in', '').replace('.co',
- '').upper()
- final.append("OrganizationName--" + match)
- # remove_list.append(match)
- except IndexError:
- company()
-
- #################################################company Name########################################
-
- def company():
- print('company list working')
- import re
-
- new = []
- with open('test.txt', 'r+') as f:
- flag = False
- for line in f:
- line = line.upper()
- matches = re.findall(
- r'''\bENTERPRISE\b|\bTRADE\b|\bEMPIRE\b|\bSTORES\b|\bMACHINERY\b|\bINDUSTRIES\b|\bTECHNOLOGY\b|\bCOMPANY\b|\bDESIGNERS\b|\bPOLYMER\b|\bBELT\b|\bAGRO\b|\bPLASTIC\b|\bGROUP\b|\bTOOLS\b|\bENGG.\b|\bSOLUTION\b|\bCONSTRUCTION\b|\bPACK\b|\bELECT\b|\bSTEEL\b|\bIRON\b|\bDIES\b|\bMOULD\b|\bCORPORATION\b|\bSEEDS\b|\bPOWER\b|\bCONSULTANT\b|\bMFG.\b|\bPRINT\b|\bFOOD\b|\bSOLAR\b|\bINDUSTRY\b|\bLIMITED\b|\bPRIVATE\b|\bPVT\b|\bLTD\b|\bOUTSOURCING\b|\bCNC\b|\bMACHINERIES\b|\bSOLUTIONS\b|\bENGINEERS\b|\bWORKS\b|\bPRODUCTS\b|\bENTERPRISES\b|\bCOMPANIES\b|\bPOLYMERS\b|\bTRADING\b''',
- line)
-
- for i in matches:
- if i in line:
- flag = True
- if flag:
- o = "OrganizationName--" + line
- new.append(o)
- # if line.startswith('\n'):
- # flag = False
- try:
- a = new[0].replace('\n', '')
- final.append(a)
- except IndexError:
- final.append("OrganizationName--")
-
- # ************************************* CONTACT PERSON *******************************************************************
- def contactpersonname():
- print('contactpersonname working')
- try:
- final.append(
- "CONTACTPERSONNAME--" + PErsons[0].replace(":PER", "").replace("[", "").replace('"', '').replace(
- "]",
- "") + '/' +
- PErsons[
- 1].replace(":PER", "").replace('"', ''))
- except IndexError:
- try:
- final.append(
- "CONTACTPERSONNAME--" + PErsons[0].replace(":PER", "").replace("[", "").replace("]",
- "").replace(
- '"', ''))
- except IndexError:
- final.append("CONTACTPERSONNAME--")
-
- def image_to_text():
-
- # doc = DocumentFile.from_images(found)
- # result = model(doc)
- # image_to_text.txt = result.render()
-
- # tess.pytesseract.tesseract_cmd = r"C:\\Program Files (x86)\\Tesseract-OCR\\tesseract.exe"
- # img = Image.open(found)
- # text = tess.image_to_string(img)
- # image_to_text.txt = text
- # print(text)
- import cv2
- img_path = found
- img = cv2.imread(img_path)
- img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
- cv2.imwrite(str(found), img)
-
- result = ocr.ocr(img_path, cls=True)
- result = result[0]
-
- txts = [line[1][0] for line in result]
-
- image_to_text.txt = ""
- for i in txts:
- if len(i) < 4:
- continue
- # print(i+"\n")
- image_to_text.txt = image_to_text.txt + str(i) + "\n"
- # print(image_to_text.txt)
-
- def pdf_to_text():
-
- from pdfminer.high_level import extract_text
- pdf_to_text.txt = extract_text(found)
- # pdf_to_text.txt= text.replace('\n', ' ')
-
- extensionlist = ['JPEG', 'jpg', 'png', 'JPG', 'PNG', 'jpeg']
-
- if extension in extensionlist:
- print('image' + extension)
- image_to_text()
- x = image_to_text.txt
-
- else:
- print('pdf' + extension)
- pdf_to_text()
- x = pdf_to_text.txt
-
- verticaltext = x
- htext = x
- # print('------------------------------------------------')
- #print('############################################################# this is verticaltext #################################################################')
- # print(verticaltext)
- htext = htext.replace('\n', ' ')
- # print('############################################################# this is htext #############################################################')
- #print(htext)
- y = x.replace('\n', ',')
- y = y.replace(' ', ' ')
- # y = y.replace(".", " .")
- horizontaltext = y
- # print('------------------------------------------------')
- #print('############################################################# this is horizontaltext #############################################################')
- #print(horizontaltext)
-
- textfile = open("test123456.txt", "w")
- a = textfile.write(verticaltext)
- textfile.close()
- textfile = open("vtext.txt", "w")
- a = textfile.write(horizontaltext)
- textfile.close()
- with open('test123456.txt', 'r') as f:
- with open('test.txt', 'w') as w:
- for line in f:
- if line.strip().replace('|', ''):
- w.write(line)
-
- ###########################ADDRESS##################################
- addrespinlst = []
-
- def splitaddress():
- import re
- textaddress = htext.replace('\n', ' ')
- # print(textaddress)
-
- address1 = (textaddress.partition(",")[0])
- words = address1.split()
- address1 = words[-1]
- addre = (htext.partition(",")[2])
- a = addre.replace('\n', ' ').replace('\x0c', '')
- addre = (a.partition(",")[2])
- matches = re.findall(
- r'(.*?)-\d{3} \d{3}|(.*?)\b-\d{6}\b|(.*?)\b\d{6}\b|(.*?)\b\d{3} \d{3}\b|\b(.*?)-\d{2}\b|(.*?)\b\d{3} \d{3}\b',
- a)
- for match in matches:
- address2 = match
- address2 = str(address2)
- address2 = address2.replace("'", "").replace("(", "").replace(")", "").replace(', ,', '').replace(' ',
- '')
-
- matches = re.findall(r'-\d{6}|\b\d{6}\b|\b\d{3} \d{3}\b|\b-\d{2}\b|-\d{3} \d{3}|\b\d{3} \d{3}\b', a)
- for address3 in matches:
- pass
- try:
- Address = address1 + "," + address2 + "," + address3
- final.append('ADDRESS--' + Address)
- addrespinlst.append(Address)
-
- except NameError:
-
- print(
- '############################################################ Addressmodelworking #############################################################')
-
- # doc = nlp_model1(textaddress)
- # addlist = []
- # for ent in doc.ents:
- # name = (f'{ent.label_.upper():{10}}--{ent.text}')
- # addlist.append(name)
- # try:
- # Address = addlist[0]
- # final.append(Address)
- # addrespinlst.append(Address)
- # remove_list.append(
- # str(Address).replace("[", "").replace("]", "").replace("\\n", "").replace("'", "").replace(
- # "ADDRESS--",
- # ""))
- # except IndexError:
- # final.append("ADDRESS--")
- pass
-
- ################################################## website#######################################################
-
- # import re
-
- # url = []
- # matches = re.findall(r'www.*', verticaltext)
- # for match in matches:
- # if (match.count('.')) == 1:
- # a_string1 = match.replace("www", "www.")
-
- # final.append("Urls--" + a_string1)
- # url.append(a_string1)
- # else:
-
- # final.append("Urls--" + match)
-
- # if len(url)==0:
-
- # from urlextract import URLExtract
-
- # extractor = URLExtract()
- # urls = extractor.find_urls(verticaltext)
- # try:
- # urllist = urls[0]
- # final.append("Urls--"+urllist)
- # url.append(urllist)
- # except IndexError:
- # final.append("Urls--")
-
- # for match in matches:
- # if (match.count('.')) == 1:
- # a_string1 = match.replace("www", "www.")
-
- # final.append("Urls--" + a_string1)
- # url.append(a_string1)
- # else:
-
- # final.append("Urls--" + match)
- # url.append(match)
- # remove_list.append(match)
- # else:
- # final.append("Urls--" )
-
- ################################################## website#######################################################
-
- import re
- # final=[]
- url = []
- urlfinal = []
- matches = re.findall(r'www.*', verticaltext)
- for match in matches:
-
- if (match.count('.')) == 1:
- a_string1 = match.replace("www", "www.")
-
- # final.append("Urls--" + a_string1)
- url.append(a_string1)
- else:
-
- url.append(match)
-
- if len(url) == 0:
-
- from urlextract import URLExtract
-
- extractor = URLExtract()
- urls = extractor.find_urls(verticaltext)
- try:
- urllist = urls[0]
- url.append(urllist)
- url.append(urllist)
- except IndexError:
- pass
-
- for match in matches:
- if (match.count('.')) == 1:
- a_string1 = match.replace("www", "www.")
-
- url.append(a_string1)
- # url.append(a_string1)
- else:
-
- url.append(match)
- url.append(match)
-
- else:
- pass
- try:
- test_string = url[0]
-
- test_list = ['com', 'www', 'in', 'co', "WWW", "COM", "CO", "IN"]
-
- res = [ele for ele in test_list if (ele in test_string)]
-
- if len(res) == 0:
- print('no match')
-
- final.append('urls--')
-
-
- else:
- print('matched')
- final.append('urls--' + url[0])
- urlfinal.append(url[0])
-
-
- except IndexError:
- final.append('urls--')
-
- print(
- '############################################################# url #############################################################')
- print(url)
- #######organisation and contact################
-
- # def company_url():
- # # print('--url--')
- # # print(url)
-
- # try:
- # match = str(url[0]).lower()
- # match =match.replace('.com','').replace('www.','').replace('.in','').replace('.co','').upper()
- # final.append("OrganizationName--" + match)
- # # remove_list.append(match)
- # except IndexError:
- # org_name()
- # organisation()
- # final.append("OrganizationName--")
-
- # make example sentence
-
- # print(horizontaltext)
- sentence = Sentence(verticaltext)
-
- # predict NER tags
- tagger.predict(sentence)
-
- # print sentence
- ko = (sentence)
-
- ko1 = str(ko).split("→")
- import pandas as pd
-
- dfg = []
- try:
- s = ko1[1].replace("", "").replace("", "").replace("/", ":")
- except IndexError:
- os.remove(found)
- return 'Invalid image'
- dfg.append(s)
- df = pd.DataFrame(dfg)
- df = df[0]
-
- df.to_csv("df.csv", index=False)
-
- df1 = pd.read_csv("df.csv")
- ve = df1["0"].str.split(",")
- fgf = ve.to_list()
- dfgh = pd.DataFrame(fgf[0])
- maindf = dfgh[0] # .str.split(":")
- # maindf.to_csv("main.csv")
-
- main1 = maindf.to_list()
- main1
- # cv=pd.DataFrame(ve)
- # cv
- per = ["PER"]
- org = ["ORG"]
- loc = ["LOC"]
- organizations = [i for i in main1 for j in org if j in i]
- PErsons = [i for i in main1 for j in per if j in i]
- location = [i for i in main1 for j in loc if j in i]
-
- # ************************************* ORGANIZATION ********************************************************************
- try:
- if len(("OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace('[',
- '').replace(
- ']', '').replace(
- '.com', '') + " /" + organizations[1].replace(":ORG", "").replace('"', '').replace('.com', ''))) < 4:
- pass
- # company_url()
- else:
-
- match = str(urlfinal[0]).lower()
- match = match.replace('.com', '').replace('www.', '').replace('.in', '').replace('.co', '').replace(
- 'https',
- '').replace(
- 'http', '').replace(":", "").replace("/", "").upper()
- print(match)
-
- s1g = organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace(']', '').replace(
- '.com', '') + " /" + \
- organizations[1].replace(":ORG", "").replace('"', '').replace('.com', '')
- s1 = s1g.upper()
- s2 = match.upper()
- from difflib import SequenceMatcher
- print(s1)
- print(s2)
- print(SequenceMatcher(None, s1, s2).ratio())
- if SequenceMatcher(None, s1, s2).ratio() >= 0.10:
- # and SequenceMatcher(None, s1, s2).ratio()<0.50:
- final.append(
- "OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace('[',
- '').replace(
- '.com', '').replace(']', '') + " /" + organizations[1].replace(":ORG", "").replace('"',
- '').replace(
- '.com', '').replace(']', ''))
- else:
- final.append("OrganizationName--" + s2)
-
-
-
- except IndexError:
- try:
- if len(("OrganizationName--" + organizations[0].replace(":ORG", "").replace('[', '').replace(']',
- '').replace(
- '"',
- '').replace(
- '.com', ''))) < 4:
- pass
- # company_url()
- else:
-
- match = str(urlfinal[0]).lower()
- match = match.replace('.com', '').replace('www.', '').replace('.in', '').replace('.co', '').replace(
- 'https', '').replace('http', '').replace(":", "").replace("/", "").upper()
-
- s1g = organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace(']',
- '').replace(
- '.com', '')
- s1 = s1g.upper()
- s2 = match.upper()
- from difflib import SequenceMatcher
- print(s1)
- print(s2)
- print(SequenceMatcher(None, s1, s2).ratio())
- if SequenceMatcher(None, s1, s2).ratio() >= 0.10:
- # and SequenceMatcher(None, s1, s2).ratio()<0.50:
- final.append(
- "OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace('[',
- '').replace(
- ']', '').replace(
- '.com', '').replace(']', ''))
- else:
- final.append("OrganizationName--" + s2)
-
- except IndexError:
- org_name()
- organisation()
-
- # final.append("OrganizationName--")
-
- # ************************************* CONTACT PERSON *******************************************************************
- try:
- final.append(
- "CONTACTPERSONNAME--" + PErsons[0].replace(":PER", "").replace("[", "").replace('"', '').replace("]",
- "") +
- PErsons[
- 1].replace(":PER", "").replace('"', ''))
- except IndexError:
- try:
- final.append(
- "CONTACTPERSONNAME--" + PErsons[0].replace(":PER", "").replace("[", "").replace("]", "").replace(
- '"',
- ''))
- except IndexError:
- org_name()
- contactpersonname()
- # final.append("CONTACTPERSONNAME--")
- ###############address flair#####################
-
- try:
- print(
- '############################################################# address new code #############################################################')
- loactionlst = ['address', 'factory', 'd.no', 'h.no', 'h. no', 'plot', 'flat', 'plat']
- loclst = [i for i in loactionlst if i in htext.lower()]
-
- textaddress = htext
- textaddress = textaddress.replace("|", ",")
- textaddress = textaddress.lower()
-
- nlp = pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy="simple")
- grop = nlp(textaddress)
-
- citycountry = []
- print('########################### city or country name ###########################')
- d = grop[-1]
-
- if d['entity_group'] == "COUNTRY":
- print(d["word"])
- citycountry.append(d["word"])
- elif d['entity_group'] == "CITY":
- print(d["word"])
- citycountry.append(d["word"])
-
- try:
- address1 = loclst[0]
- except IndexError:
- address1 = (textaddress.partition(",")[0])
- words = address1.split()
- address1 = words[-1]
-
- star_location = address1.lower()
- end_location = citycountry[0].replace("#", "")
- start = star_location
- end = end_location
- s = textaddress.lower()
- middle_address = (s.split(start))[-1].split(end)[0]
- Address = start + middle_address + end
- Address = Address.replace('--', '').title()
- print(Address)
- if Address.count(',') < 2:
- splitaddress()
- else:
- final.append('ADDRESS--' + Address)
-
- # star_location = location[0].replace(":LOC", "").replace('"', '').replace('[', '')
- # end_location = location[-1].replace(":LOC", "").replace('"', '').replace(']', '')
- # d1 = star_location.split()
- # d2 = end_location.split()
- # d3 = d1[0]
- # d4 = d2[0]
- # start = d3
- # end = d4
- # s = horizontaltext
- # middle_address = ((s.split(start))[1].split(end)[0])
- # Address = d3 + middle_address + d4
- # final.append('ADDRESS--' + Address)
- # addrespinlst.append(Address)
-
-
- except IndexError:
- splitaddress()
-
- ########################################## Designation ###########################################
- import re
- new = []
- with open('test.txt', 'r') as f:
- flag = False
- for line in f:
- line1 = line
- line = line.upper()
- matches = re.findall(
- r'''\bAPPRENTICE\b|\bEXECUTIVE\b|\bPROPRIETOR\b|\bPARTNER\b|\bMD\b|\bANALYST\b|\bPRACTITIONER\b|\bCUSTOMER\b|\bCOO\b|\bCOACH\b|\bADMINISTRATIVE\b|\bADMINISTRATOR\b|\bAGENT\b|\bHEAD\b|\bCHIEF\b|\bDIRECTOR\b|\bVICE\b|\bPRESIDENT\b|\bMANAGER\b|\bCOORDINATOR\b|\bCOUNSELOR\b|\bSUPERVISOR\b|\bASSISTANT\b|\bSPECIALIST\b|\bARTIST\b|\bWORKER\b|\bCONSULTANT\b|\bREPRESENTATIVE\b|\bARCHITECT\b|\bSTAFF\b|\bMEMBER\b|\bDEVELOPER\b|\bENGINEER\b|\bEXAMINOR\b|\bDOCTOR\b|\bPROFESSOR\b|\bTEACHER\b|\bLEAD\b|\bOFFICER\b|\bCEO\b|\bC.E.O\b|\bJUNIOR\b|\bSENIOR\b|\bPROFESSOR\b|\bSALES\b''',
- line)
- for match in matches:
- line = line.replace('-', '')
- # print(line)
- o = "Designation--" + line
- new.append(o)
- remove_list.append(str(line1).replace('\n', ''))
-
- try:
- a = new[0].replace('\n', '')
- final.append(a)
-
- except IndexError:
- final.append("Designation--")
-
- ###################################################Phone number#################################################
- num = []
- import phonenumbers
-
- # print(verticaltext)
- numbers = phonenumbers.PhoneNumberMatcher(
- verticaltext.replace('+91', '').replace('(0)', '').replace('(', '').replace(')', ''), "IN")
-
- for number in numbers:
- number = str(number).split(")")
- num.append(number[1])
- # num.append(number[-1])
- if len(num) == 0:
- final.append("ContactNumber--")
- final.append("OrganizationNumber--")
- elif len(num) > 1:
- final.append("ContactNumber--" + num[0].replace(' ', ''))
- final.append("OrganizationNumber--" + num[-1].replace(' ', ''))
- elif len(num) == 1:
- try:
- final.append("ContactNumber--" + num[0].replace(' ', ''))
- final.append("OrganizationNumber--")
- except IndexError:
- final.append("ContactNumber--")
- final.append("OrganizationNumber--")
- print(
- '############################################################# num #############################################################')
- print(num)
- # try:
- # final.append("PhoneNumber--" + num[0].replace(' ', ''))
- # remove_list.append(num[0])
- # except IndexError:
- # pass
- # try:
- # final.append("PhoneNumber1--" + num[1].replace(' ', ''))
- # remove_list.append(num[1])
- # except IndexError:
- # pass
- # try:
- # final.append("PhoneNumber2--" + num[2].replace(' ', ''))
- # remove_list.append(num[2])
- # except IndexError:
- # pass
-
- ################################################### Email######################################################
- import re
- from email_scraper import scrape_emails
- s = list(scrape_emails(horizontaltext))
- email_id = s
-
- # email_id = []
- # matches = re.findall(r'[\w\.-]+@[\w\.-]+', verticaltext)
- # for match in matches:
- # email_id.append(match)
-
- # # final.append('Email--' + match)
- # email_ = str(email_id).replace("[", "").replace("]", "").replace("'", "")
- # # final.append(email_)
-
- # # final.append('Email--' + email_)
- # # remove_list.append(email_)
- if len(email_id) > 1:
- final.append(
- 'ContactEmail--' + str(email_id[0]).replace("[", "").replace("]", "").replace("\\n", "").replace("'",
- ""))
- final.append(
- 'OrganizationEmail--' + str(email_id[-1]).replace("[", "").replace("]", "").replace("\\n", "").replace(
- "'",
- ""))
- else:
- try:
- final.append(
- 'ContactEmail--' + str(email_id[0]).replace("[", "").replace("]", "").replace("\\n", "").replace(
- "'",
- ""))
- final.append('OrganizationEmail--')
- except IndexError:
- final.append('ContactEmail--')
- final.append('OrganizationEmail--')
-
- ###############PINCODE############
-
- pinlst = []
- print(addrespinlst)
- import pgeocode
-
- # try:
- # matche1 = re.findall(r'-\d{6}|\b\d{6}\b|\b\d{3} \d{3}\b|-\d{3} \d{3}|\b\d{3} \d{3}\b', addrespinlst[0])
- # for i in matche1:
- # address3 = i.replace(' ', '').replace('-', '')
- # pinlst.append(address3)
- # except IndexError:
-
- lst = []
- for i in num:
- i = i[1:]
- lst.append(i)
-
- infile = r"vtext.txt"
- outfile = r"cleaned_file.txt"
- import glob
- delete_list = lst
- # delete_list = ["firstname1 lastname1","firstname2 lastname2","firstnamen lastnamen",'Director - Sales & Business Development']
- fin = open(infile, "r+")
- fout = open(outfile, "w+")
- for line12 in fin:
- for word in delete_list:
- line12 = line12.replace(word, "")
-
- fout.write(line12)
- fin.close()
- # print(line)
-
- # print(addrespinlst)
- import pgeocode
- #print(line12)
- import re
- matche1 = re.findall(r'-\d{6}|\b\d{6}\b|\b\d{3} \d{3}\b|-\d{3} \d{3}|\b\d{3} \d{3}\b', line12)
- for i in matche1:
- address3 = i.replace(' ', '').replace('-', '')
- pinlst.append(address3)
-
- nomi = pgeocode.Nominatim('IN')
- try:
- a = nomi.query_postal_code(str(pinlst[-1]))
- # print(a)
- b = a.keys()
- c = b.values.tolist()
- d = a.tolist()
- postal_code = "PinCode1" + "--" + d[0]
- final.append(postal_code)
- country_code = c[1] + "--" + str(d[1])
- final.append(country_code)
- place_name = 'LandMark1' + "--" + str(d[2])
- final.append(place_name)
- state_name = c[3] + "--" + str(d[3])
- final.append(state_name)
- state_code = c[4] + "--" + str(d[4])
- final.append(state_code)
- county_name = 'CityName1' + "--" + str(d[5])
- final.append(county_name)
-
- except (IndexError, NameError):
- final.append("PinCode1--")
- final.append("country_code--")
- final.append("LandMark1--")
- final.append("state_name--")
- final.append("state_code--")
- final.append("CityName1--")
-
- ######################################################## json #####################################################################
-
- import pandas as pd
- df = pd.DataFrame(final)
- df1 = df[0].str.split('--', expand=True)
- # print(df1)
- df1.rename({df1.columns[-2]: 'Keys'}, axis=1, inplace=True)
- df1.rename({df1.columns[-1]: 'Values'}, axis=1, inplace=True)
- df1['Keys'] = df1['Keys'].str.strip()
- df1.to_csv('path123.csv', index=False)
- df2 = pd.read_csv('path123.csv')
- print(df2)
- df2 = df2.T
- df2.to_csv('path1.csv', index=False, header=False)
- df1 = pd.read_csv('path1.csv')
- df1.to_json('firstjson1.json', orient="index")
- import json
- with open('firstjson1.json', 'r') as json_file:
- json_load = json.load(json_file)
- # # url = "https://test.bizgaze.app:8443/apis/v4/bizgaze/integrations/businesscards/create"
- nothing = json.dumps(json_load).replace("]", "").replace("[", "").replace('{"0":', '').replace('}}', '}')
- # # print('--------------------------------------------------------------------------')
- # # print(nothing)
- empty = []
- import base64
- name = found
- image = open(name, 'rb')
- image_read = image.read()
- image_64_encode = base64.b64encode(image_read)
- NULL = 'null'
- empty.append("ByteData--" + (NULL).strip('""'))
- image_64_encode = image_64_encode.decode('utf-8')
- empty.append("FileData--" + str(image_64_encode))
- imagedata = name.split("/")
- imagename = str(imagedata[-1]).replace('"', '').replace("[", "").replace("]", "")
- imagename1 = str(imagename).split('.')
- imagename = str(imagename1[-2]).replace("[", "]")
- empty.append("FileName--" + imagename)
- empty.append("FilePath--"+ "")
- imageExtension = str(imagename1[-1]).replace("[", "]")
- empty.append("FileType--" + imageExtension)
- image.close()
- import pandas as pd
- df = pd.DataFrame(empty)
- df = df[0].str.split("--", expand=True)
- data1 = pd.DataFrame(df[0])
- data2 = pd.DataFrame(df[1])
- dt = data2.set_index(data1[0])
- dt4 = dt.T
- dictionary = dt4.to_dict(orient="index")
- list1 = []
- # list.append(a)
- list1.append(dictionary[1])
- # # final.append("image--"+str(dictionary[1]).replace("\'",'"'))
- print('--------------------')
- # print(namelist)
- import json
- # JSON data:
- x = nothing
- # python object to be appended
- y = {"image": dictionary[1]}
- # parsing JSON string:
- z = json.loads(x)
- # appending the data
- z.update(y)
- # the result is a JSON string:
- # print(json.dumps(z))
-
- zlist.append(z)
- #############################################creating csv#####################################
- #print(final)
- #print(imagelist)
- #final.append('image--' + str(imagelist))
- # import requests
- # import json
-
- # # url = "https://anwi.bizgaze.app/apis/v4/bizgaze/integrations/businesscards/create/list" #dev
- # url = "https://qa.bizgaze.com/apis/v4/bizgaze/integrations/businesscards/create/list" #testing
- # # url = "https://test.bizgaze.app/apis/v4/bizgaze/integrations/businesscards/create" # test
- # # url='http://localhost:3088/apis/v4/bizgaze/integrations/businesscards/create'
- # # url = 'https://c01.bizgaze.app/apis/v4/bizgaze/integrations/businesscards/create' # C01
- # payload1 = json.dumps(zlist)
- # # print('--------------------------------------------------------------------------')
- # #print(payload1)
- # headers = {
- # #'Authorization': 'stat 1a936137490040c997928f485e3cdd7a', #dev
- # # 'Authorization': 'stat 16516391d0074f4c8a15ea16fb49470b',#testing
- # # 'Authorization': 'stat 08e55fcfbaa940c8ab8145a074c444d1',
- # # 'Authorization': 'stat f7cdb402e01e44e5842878653946168f', # c01
- # #'Authorization': 'Stat c3e11b2fcbfe455b86a1fe6efde02a69',#demo
- # 'Authorization':'Stat e5bc6ad08f2c42feb5f98a2a521d00af',
-
-
- # 'Content-Type': 'application/json'
- # }
- # response = requests.request("POST", url, headers=headers, data=payload1)
- # # print("##############################################################")
-
- # print(payload1)
- # #print(zlist)
- # # import os
- # # if 'BusinessCards Created Successfully' in response.text:
- # # print('present')
- # # os.remove(found)
- # # else:
- # # print('not present')
-
- # df1.to_json('visitingcard.json')
- # data = df1.to_json('visiting.json', orient='records')
- # print(data)
-
- #return render_template('index.html')
-
-
- #return response.text
- #return z
- return zlist
-
-
-
-
- # Script entry point: start the Flask development server for this app.
- # NOTE(review): host='0.0.0.0' binds on all network interfaces, exposing the
- # dev server externally on port 1112 — confirm this is intended (Flask's dev
- # server is not meant for production use).
- # presumably `app` is a Flask instance created near the top of this file — verify.
- if __name__ == "__main__":
- app.run(host='0.0.0.0', port=1112)
|