# NOTE: extraction artifact removed here — the original code viewer's line
# numbers (1..1117) had been concatenated into a single digit run.
- from flask import Flask, render_template, request, redirect, Response, send_file
- import os
- import openai
- import requests
- import pandas as pd
- import pgeocode
- from email_scraper import scrape_emails
- import phonenumbers
- from pdfminer.high_level import extract_text
- import pytesseract
- import time
- import multiprocessing
- from PIL import Image
- from functools import partial
- from urlextract import URLExtract
- import pytesseract as tess
- from PIL import Image
- import os
- import glob
-
- from pytesseract import *
- import shutil
- import cv2
- import matplotlib
- from werkzeug.utils import secure_filename
- import requests
- import spacy
- import time
- import multiprocessing
- from PIL import Image
- from functools import partial
-
- import pandas as pd
- ################################################################
- Current_Working_Directory=os.getcwd()
- Current_Working_Directory=Current_Working_Directory.replace("\\","/")
- nlp_model1 = spacy.load(Current_Working_Directory + "/Invoice_parser/p")
-
- ################################################################
- # import spacy
-
- # nlp_model1 = spacy.load('./ADD3001.2')
- from flair.data import Sentence
- from flair.models import SequenceTagger
- from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
-
- tokenizer = AutoTokenizer.from_pretrained("ml6team/bert-base-uncased-city-country-ner")
- model = AutoModelForTokenClassification.from_pretrained("ml6team/bert-base-uncased-city-country-ner")
-
- from paddleocr import PaddleOCR, draw_ocr
-
- ocr = PaddleOCR(use_angle_cls=True, lang='en', use_space_char=True, show_log=True)
- tagger = SequenceTagger.load("flair/ner-english-large")
-
- import datetime
-
- app = Flask(__name__)
-
-
- # app.config["IMAGE_UPLOADS"] = "C:/inetpub/wwwroot/FlaskApplication/Flask_Demo/upload/"
-
@app.route('/', methods=['GET'])
def card():
    """Serve the landing page containing the business-card upload form."""
    template_name = 'card.html'
    return render_template(template_name)
-
-
- @app.route('/upload_BusinessCards', methods=["POST"])
- # @app.route('/multiplecards', methods=["POST"])
- def multiplecards():
- # print('################## multiple card detection #######################')
- # print(Dataset)
- datalist=[]
- zlist=[]
- Dataset = request.get_json()
- # print(data)
- #datalist.append(Dataset)
- data = {'visiting': Dataset}
- for i in data['visiting']:
- import time
- # time.sleep(1)
- a = i
- x = a['FileData']
- # print(x)
- y = a['FileName']
- z = a['FileType']
- # CreatedBy=a['CreatedBy']
-
- name = y + '.' + z
- # print(name)
- # print(y)
- # image = y.split("/")
- # filename=image[-1]
-
- # print(x)
- img_data = x.encode()
-
- import base64
- with open('./multicards/' + name, "wb") as fh:
- fh.write(base64.decodebytes(img_data))
- # print(i)
-
- # import os
- # import glob
- # for i in glob.glob('./multipleupload/*'):
-
- found = './multicards/' + name
- print(found)
- extension = found.split('.')[-1]
-
- # for root, dirs, fils in os.glob('./multipleupload'):
- # for name in files:
- # foundfile= os.path.join(root, name)
- # print(foundfile)
-
- import re
- import csv
- import glob
- import os
- # import pytesseract
- # import cv2
- import numpy as np
- import glob
- import os
- import cv2
- import requests
- final = []
- # final.append('assignto--'+CreatedBy)
- imagelist = []
- # print(found)
- remove_list = []
- import os
- import glob
- import pdfminer
-
- # import os
- # ts = 0
- # for file_name in glob.glob('./upload/*'):
- # fts = os.path.getmtime(file_name)
- # if fts > ts:
- # ts = fts
- # found = file_name
- # print(found)
-
- # print(extension)
-
- def org_name():
- """Re-OCR the uploaded file with Tesseract (via PDF + pdfminer) and run
- flair NER over the text, splitting the tagged tokens into ORG/PER/LOC
- lists.
-
- NOTE(review): `organizations`, `PErsons` and `location` are assigned as
- *locals* of this function — they do not update the variables of the same
- name in the enclosing scope that organisation()/contactpersonname() read.
- Verify this is intended before relying on this helper's results.
- Writes temp files images1.png / demo.pdf / df.csv as side effects.
- """
- print('org_name is working')
- import pytesseract
- fname = found
- if extension != 'pdf':
-
- # Image input: grayscale it, re-save at high DPI, then let Tesseract
- # produce a searchable PDF which pdfminer can extract text from.
- img = cv2.imread(fname)
- img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
-
- cv2.imwrite(str(found), img)
- from PIL import Image
- im = Image.open(found)
- im.save("images1.png", dpi=(1200, 1200))
- # import pytesseract
- fname = "images1.png"
- import pytesseract as tess
- from PIL import Image
-
- # Hard-coded Windows Tesseract path — this helper only works on hosts
- # with Tesseract installed at this location.
- tess.pytesseract.tesseract_cmd = r"C:\\Program Files (x86)\\Tesseract-OCR\\tesseract.exe"
- pdf = tess.image_to_pdf_or_hocr(fname, extension="pdf")
- with open("demo.pdf", "w+b", ) as f:
- f.write(pdf)
-
- from pdfminer.high_level import extract_text
- text = extract_text('demo.pdf')
- # doc = DocumentFile.from_images(found)
- # result = model(doc)
- # text = result.render()
-
- # from pdfminer.high_level import extract_text
- # txt = extract_text('demo.pdf')
- else:
- # PDF input: extract text directly.
- from pdfminer.high_level import extract_text
- text = extract_text(fname)
-
- sentence = Sentence(text)
-
- # predict NER tags
- tagger.predict(sentence)
-
- # print sentence
- ko = (sentence)
-
- # flair renders tags after a "→" arrow; everything after it is the
- # token/tag list.
- ko1 = str(ko).split("→")
- import pandas as pd
-
- dfg = []
- try:
- s = ko1[1].replace("", "").replace("", "").replace("/", ":")
-
- # os.remove(found)
- # return 'Invalid image'
- # Round-trip through CSV to split the tagged text into one item per
- # comma-separated "token:TAG" entry.
- dfg.append(s)
- df = pd.DataFrame(dfg)
- df = df[0]
-
- df.to_csv("df.csv", index=False)
-
- df1 = pd.read_csv("df.csv")
- ve = df1["0"].str.split(",")
- fgf = ve.to_list()
- dfgh = pd.DataFrame(fgf[0])
- maindf = dfgh[0] # .str.split(":")
- # maindf.to_csv("main.csv")
-
- main1 = maindf.to_list()
- main1
- # cv=pd.DataFrame(ve)
- # cv
- # Bucket tagged tokens by entity type (substring match on ":ORG" etc.).
- per = ["PER"]
- org = ["ORG"]
- loc = ["LOC"]
- organizations = [i for i in main1 for j in org if j in i]
- PErsons = [i for i in main1 for j in per if j in i]
- location = [i for i in main1 for j in loc if j in i]
- except IndexError:
- # No "→" in the flair output (no entities found): leave lists unset.
- pass
-
- # ************************************* ORGANIZATION ********************************************************************
-
- def organisation():
- """Append an "OrganizationName--..." entry to the closure list `final`.
-
- Strategy: clean the first one/two flair ORG entities (strip ":ORG",
- quotes, brackets, domain suffixes), compare against the first detected
- URL (`urlfinal[0]`) with difflib.SequenceMatcher, and keep whichever
- looks more plausible.  Falls back through nested IndexError handlers:
- two ORG entities -> one ORG entity -> URL only -> company() keyword scan.
- """
- print('organisation working ')
- try:
- # Branch 1: at least two ORG entities available.
- if len(("OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace('[',
- '').replace(
- '.com', '').replace('.in', '') + " /" + organizations[1].replace(":ORG", "").replace('"',
- '').replace(
- '.com', ''))) < 4:
- pass
-
-
- else:
-
- # Normalise the first URL down to a bare name for comparison.
- match = str(urlfinal[0]).lower()
- match = match.replace('.com', '').replace('www.', '').replace('.in', '').replace('.co', '').replace(
- 'https',
- '').replace(
- 'http', '').replace(":", "").replace("/", "").upper()
- print(match)
-
- s1g = organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace('.com',
- '') + " /" + \
- organizations[1].replace(":ORG", "").replace('"', '').replace('.com', '')
- s1 = s1g.upper()
- s2 = match.upper()
- from difflib import SequenceMatcher
- print(s1)
- print(s2)
- print(SequenceMatcher(None, s1, s2).ratio())
- # Even a weak similarity (>= 0.10) keeps the NER names; otherwise
- # trust the URL-derived name.
- if SequenceMatcher(None, s1, s2).ratio() >= 0.10:
- # and SequenceMatcher(None, s1, s2).ratio()<0.50:
- final.append(
- "OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace('[',
- '').replace(
- '.com', '').replace(']', '') + " /" + organizations[1].replace(":ORG", "").replace('"',
- '').replace(
- '.com',
- '').replace(']', ''))
- else:
- final.append("OrganizationName--" + s2)
-
- except IndexError:
- # Branch 2: only one ORG entity — same comparison with a single name.
- try:
- if len(("OrganizationName--" + organizations[0].replace(":ORG", "").replace('[', '').replace(']',
- '').replace(
- '"',
- '').replace(
- '.com', '').replace('.in', ''))) < 4:
- pass
-
- else:
- match = str(urlfinal[0]).lower()
- match = match.replace('.com', '').replace('www.', '').replace('.in', '').replace('.co',
- '').replace(
- 'https', '').replace('http', '').replace(":", "").replace("/", "").upper()
-
- s1g = organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace('.com', '')
- s1 = s1g.upper()
- s2 = match.upper()
- from difflib import SequenceMatcher
- print(s1)
- print(s2)
- print(SequenceMatcher(None, s1, s2).ratio())
- if SequenceMatcher(None, s1, s2).ratio() >= 0.10:
- # and SequenceMatcher(None, s1, s2).ratio()<0.50:
- final.append(
- "OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace(
- '[',
- '').replace(
- ']', '').replace(
- '.com', ''))
- else:
- final.append("OrganizationName--" + s2)
-
- except IndexError:
- # Branch 3: no ORG entities — derive the name from the URL alone.
- try:
- match = str(urlfinal[0]).lower()
- match = match.replace('.com', '').replace('www.', '').replace('.in', '').replace('.co',
- '').upper()
- final.append("OrganizationName--" + match)
- # remove_list.append(match)
- except IndexError:
- # Branch 4: no URL either — fall back to the keyword scan.
- company()
-
- #################################################company Name########################################
-
- def company():
- """Last-resort organisation-name detector: scan test.txt (the OCR text
- written earlier by the caller) for lines containing company-style
- keywords (LTD, PVT, ENTERPRISES, ...) and append the first such line to
- the closure list `final` as "OrganizationName--<line>"; appends the bare
- prefix when nothing matches.
- """
- print('company list working')
- import re
-
- new = []
- with open('test.txt', 'r+') as f:
- flag = False
- for line in f:
- # Uppercase so the keyword regex below is effectively
- # case-insensitive.
- line = line.upper()
- matches = re.findall(
- r'''\bENTERPRISE\b|\bTRADE\b|\bEMPIRE\b|\bSTORES\b|\bMACHINERY\b|\bINDUSTRIES\b|\bTECHNOLOGY\b|\bCOMPANY\b|\bDESIGNERS\b|\bPOLYMER\b|\bBELT\b|\bAGRO\b|\bPLASTIC\b|\bGROUP\b|\bTOOLS\b|\bENGG.\b|\bSOLUTION\b|\bCONSTRUCTION\b|\bPACK\b|\bELECT\b|\bSTEEL\b|\bIRON\b|\bDIES\b|\bMOULD\b|\bCORPORATION\b|\bSEEDS\b|\bPOWER\b|\bCONSULTANT\b|\bMFG.\b|\bPRINT\b|\bFOOD\b|\bSOLAR\b|\bINDUSTRY\b|\bLIMITED\b|\bPRIVATE\b|\bPVT\b|\bLTD\b|\bOUTSOURCING\b|\bCNC\b|\bMACHINERIES\b|\bSOLUTIONS\b|\bENGINEERS\b|\bWORKS\b|\bPRODUCTS\b|\bENTERPRISES\b|\bCOMPANIES\b|\bPOLYMERS\b|\bTRADING\b''',
- line)
-
- # NOTE(review): once `flag` turns True it is never reset, so every
- # subsequent line is also appended to `new`; only new[0] (the first
- # matching line) is actually used below.
- for i in matches:
- if i in line:
- flag = True
- if flag:
- o = "OrganizationName--" + line
- new.append(o)
- # if line.startswith('\n'):
- # flag = False
- try:
- a = new[0].replace('\n', '')
- final.append(a)
- except IndexError:
- # No keyword line found: emit an empty organisation field.
- final.append("OrganizationName--")
-
- # ************************************* CONTACT PERSON *******************************************************************
- def contactpersonname():
- """Append a "CONTACTPERSONNAME--..." entry to the closure list `final`
- from the flair PER entities in the closure list `PErsons`: first two
- names joined with '/', else the single name, else the bare prefix.
- """
- print('contactpersonname working')
- try:
- # Two (or more) person entities: join the first two with '/'.
- final.append(
- "CONTACTPERSONNAME--" + PErsons[0].replace(":PER", "").replace("[", "").replace('"', '').replace(
- "]",
- "") + '/' +
- PErsons[
- 1].replace(":PER", "").replace('"', ''))
- except IndexError:
- try:
- # Exactly one person entity.
- final.append(
- "CONTACTPERSONNAME--" + PErsons[0].replace(":PER", "").replace("[", "").replace("]",
- "").replace(
- '"', ''))
- except IndexError:
- # No person entities at all.
- final.append("CONTACTPERSONNAME--")
-
- def image_to_text():
- """OCR the uploaded image at closure path `found` with the module-level
- PaddleOCR instance and store the recognised lines (each >= 4 chars,
- newline-joined) on the function attribute image_to_text.txt.
-
- Side effect: overwrites the file at `found` with a grayscale version.
- """
-
- # doc = DocumentFile.from_images(found)
- # result = model(doc)
- # image_to_text.txt = result.render()
-
- # tess.pytesseract.tesseract_cmd = r"C:\\Program Files (x86)\\Tesseract-OCR\\tesseract.exe"
- # img = Image.open(found)
- # text = tess.image_to_string(img)
- # image_to_text.txt = text
- # print(text)
- import cv2
- img_path = found
- # Grayscale the image in place before OCR.
- img = cv2.imread(img_path)
- img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
- cv2.imwrite(str(found), img)
-
- # PaddleOCR returns a list per page; take the first page's line results.
- result = ocr.ocr(img_path, cls=True)
- result = result[0]
-
- # Each line result is (box, (text, confidence)); keep the text only.
- txts = [line[1][0] for line in result]
-
- image_to_text.txt = ""
- for i in txts:
- # Drop very short fragments (< 4 chars) — typically OCR noise.
- if len(i) < 4:
- continue
- # print(i+"\n")
- image_to_text.txt = image_to_text.txt + str(i) + "\n"
- # print(image_to_text.txt)
-
- def pdf_to_text():
- """Extract the text of the uploaded PDF at closure path `found` with
- pdfminer and store it on the function attribute pdf_to_text.txt.
- """
-
- from pdfminer.high_level import extract_text
- pdf_to_text.txt = extract_text(found)
- # pdf_to_text.txt= text.replace('\n', ' ')
-
- extensionlist = ['JPEG', 'jpg', 'png', 'JPG', 'PNG', 'jpeg']
-
- if extension in extensionlist:
- print('image' + extension)
- image_to_text()
- x = image_to_text.txt
-
- else:
- print('pdf' + extension)
- pdf_to_text()
- x = pdf_to_text.txt
-
- verticaltext = x
- htext = x
- # print('------------------------------------------------')
- #print('############################################################# this is verticaltext #################################################################')
- # print(verticaltext)
- htext = htext.replace('\n', ' ')
- # print('############################################################# this is htext #############################################################')
- #print(htext)
- y = x.replace('\n', ',')
- y = y.replace(' ', ' ')
- # y = y.replace(".", " .")
- horizontaltext = y
- # print('------------------------------------------------')
- #print('############################################################# this is horizontaltext #############################################################')
- #print(horizontaltext)
-
- textfile = open("test123456.txt", "w")
- a = textfile.write(verticaltext)
- textfile.close()
- textfile = open("vtext.txt", "w")
- a = textfile.write(horizontaltext)
- textfile.close()
- with open('test123456.txt', 'r') as f:
- with open('test.txt', 'w') as w:
- for line in f:
- if line.strip().replace('|', ''):
- w.write(line)
-
- ###########################ADDRESS##################################
- addrespinlst = []
-
- def splitaddress():
- """Regex fallback for address extraction: build an address out of the
- word before the first comma of the closure text `htext`, the text up to
- a postal-code-like pattern, and the last postal-code match, then append
- "ADDRESS--..." to the closure lists `final` / `addrespinlst`.
-
- If no postal-code pattern matched, `address2`/`address3` are never bound
- and the concatenation raises NameError, which is deliberately caught and
- swallowed (only a marker line is printed).
- """
- import re
- textaddress = htext.replace('\n', ' ')
- # print(textaddress)
-
- # First fragment: last word before the first comma.
- address1 = (textaddress.partition(",")[0])
- words = address1.split()
- address1 = words[-1]
- addre = (htext.partition(",")[2])
- a = addre.replace('\n', ' ').replace('\x0c', '')
- addre = (a.partition(",")[2])
- # Middle fragment: non-greedy text up to an Indian-PIN-like number
- # (6 digits, "3 3" split, or hyphenated forms).  Loop keeps the LAST
- # match.
- matches = re.findall(
- r'(.*?)-\d{3} \d{3}|(.*?)\b-\d{6}\b|(.*?)\b\d{6}\b|(.*?)\b\d{3} \d{3}\b|\b(.*?)-\d{2}\b|(.*?)\b\d{3} \d{3}\b',
- a)
- for match in matches:
- address2 = match
- address2 = str(address2)
- address2 = address2.replace("'", "").replace("(", "").replace(")", "").replace(', ,', '').replace(' ',
- '')
-
- # Last fragment: the PIN code itself; again the loop keeps the last hit.
- matches = re.findall(r'-\d{6}|\b\d{6}\b|\b\d{3} \d{3}\b|\b-\d{2}\b|-\d{3} \d{3}|\b\d{3} \d{3}\b', a)
- for address3 in matches:
- pass
- try:
- Address = address1 + "," + address2 + "," + address3
- final.append('ADDRESS--' + Address)
- addrespinlst.append(Address)
-
- except NameError:
-
- # No regex match bound address2/address3 — best-effort: record nothing.
- print(
- '############################################################ Addressmodelworking #############################################################')
-
- # doc = nlp_model1(textaddress)
- # addlist = []
- # for ent in doc.ents:
- # name = (f'{ent.label_.upper():{10}}--{ent.text}')
- # addlist.append(name)
- # try:
- # Address = addlist[0]
- # final.append(Address)
- # addrespinlst.append(Address)
- # remove_list.append(
- # str(Address).replace("[", "").replace("]", "").replace("\\n", "").replace("'", "").replace(
- # "ADDRESS--",
- # ""))
- # except IndexError:
- # final.append("ADDRESS--")
- pass
-
- ################################################## website#######################################################
-
- # import re
-
- # url = []
- # matches = re.findall(r'www.*', verticaltext)
- # for match in matches:
- # if (match.count('.')) == 1:
- # a_string1 = match.replace("www", "www.")
-
- # final.append("Urls--" + a_string1)
- # url.append(a_string1)
- # else:
-
- # final.append("Urls--" + match)
-
- # if len(url)==0:
-
- # from urlextract import URLExtract
-
- # extractor = URLExtract()
- # urls = extractor.find_urls(verticaltext)
- # try:
- # urllist = urls[0]
- # final.append("Urls--"+urllist)
- # url.append(urllist)
- # except IndexError:
- # final.append("Urls--")
-
- # for match in matches:
- # if (match.count('.')) == 1:
- # a_string1 = match.replace("www", "www.")
-
- # final.append("Urls--" + a_string1)
- # url.append(a_string1)
- # else:
-
- # final.append("Urls--" + match)
- # url.append(match)
- # remove_list.append(match)
- # else:
- # final.append("Urls--" )
-
- ################################################## website#######################################################
-
- import re
- # final=[]
- url = []
- urlfinal = []
- matches = re.findall(r'www.*', verticaltext)
- for match in matches:
-
- if (match.count('.')) == 1:
- a_string1 = match.replace("www", "www.")
-
- # final.append("Urls--" + a_string1)
- url.append(a_string1)
- else:
-
- url.append(match)
-
- if len(url) == 0:
-
- from urlextract import URLExtract
-
- extractor = URLExtract()
- urls = extractor.find_urls(verticaltext)
- try:
- urllist = urls[0]
- url.append(urllist)
- url.append(urllist)
- except IndexError:
- pass
-
- for match in matches:
- if (match.count('.')) == 1:
- a_string1 = match.replace("www", "www.")
-
- url.append(a_string1)
- # url.append(a_string1)
- else:
-
- url.append(match)
- url.append(match)
-
- else:
- pass
- try:
- test_string = url[0]
-
- test_list = ['com', 'www', 'in', 'co', "WWW", "COM", "CO", "IN"]
-
- res = [ele for ele in test_list if (ele in test_string)]
-
- if len(res) == 0:
- print('no match')
-
- final.append('urls--')
-
-
- else:
- print('matched')
- final.append('urls--' + url[0])
- urlfinal.append(url[0])
-
-
- except IndexError:
- final.append('urls--')
-
- print(
- '############################################################# url #############################################################')
- print(url)
- #######organisation and contact################
-
- # def company_url():
- # # print('--url--')
- # # print(url)
-
- # try:
- # match = str(url[0]).lower()
- # match =match.replace('.com','').replace('www.','').replace('.in','').replace('.co','').upper()
- # final.append("OrganizationName--" + match)
- # # remove_list.append(match)
- # except IndexError:
- # org_name()
- # organisation()
- # final.append("OrganizationName--")
-
- # make example sentence
-
- # print(horizontaltext)
- sentence = Sentence(verticaltext)
-
- # predict NER tags
- tagger.predict(sentence)
-
- # print sentence
- ko = (sentence)
-
- ko1 = str(ko).split("→")
- import pandas as pd
-
- dfg = []
- try:
- s = ko1[1].replace("", "").replace("", "").replace("/", ":")
- except IndexError:
- os.remove(found)
- return 'Invalid image'
- dfg.append(s)
- df = pd.DataFrame(dfg)
- df = df[0]
-
- df.to_csv("df.csv", index=False)
-
- df1 = pd.read_csv("df.csv")
- ve = df1["0"].str.split(",")
- fgf = ve.to_list()
- dfgh = pd.DataFrame(fgf[0])
- maindf = dfgh[0] # .str.split(":")
- # maindf.to_csv("main.csv")
-
- main1 = maindf.to_list()
- main1
- # cv=pd.DataFrame(ve)
- # cv
- per = ["PER"]
- org = ["ORG"]
- loc = ["LOC"]
- organizations = [i for i in main1 for j in org if j in i]
- PErsons = [i for i in main1 for j in per if j in i]
- location = [i for i in main1 for j in loc if j in i]
-
- # ************************************* ORGANIZATION ********************************************************************
- try:
- if len(("OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace('[',
- '').replace(
- ']', '').replace(
- '.com', '') + " /" + organizations[1].replace(":ORG", "").replace('"', '').replace('.com', ''))) < 4:
- pass
- # company_url()
- else:
-
- match = str(urlfinal[0]).lower()
- match = match.replace('.com', '').replace('www.', '').replace('.in', '').replace('.co', '').replace(
- 'https',
- '').replace(
- 'http', '').replace(":", "").replace("/", "").upper()
- print(match)
-
- s1g = organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace(']', '').replace(
- '.com', '') + " /" + \
- organizations[1].replace(":ORG", "").replace('"', '').replace('.com', '')
- s1 = s1g.upper()
- s2 = match.upper()
- from difflib import SequenceMatcher
- print(s1)
- print(s2)
- print(SequenceMatcher(None, s1, s2).ratio())
- if SequenceMatcher(None, s1, s2).ratio() >= 0.10:
- # and SequenceMatcher(None, s1, s2).ratio()<0.50:
- final.append(
- "OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace('[',
- '').replace(
- '.com', '').replace(']', '') + " /" + organizations[1].replace(":ORG", "").replace('"',
- '').replace(
- '.com', '').replace(']', ''))
- else:
- final.append("OrganizationName--" + s2)
-
-
-
- except IndexError:
- try:
- if len(("OrganizationName--" + organizations[0].replace(":ORG", "").replace('[', '').replace(']',
- '').replace(
- '"',
- '').replace(
- '.com', ''))) < 4:
- pass
- # company_url()
- else:
-
- match = str(urlfinal[0]).lower()
- match = match.replace('.com', '').replace('www.', '').replace('.in', '').replace('.co', '').replace(
- 'https', '').replace('http', '').replace(":", "").replace("/", "").upper()
-
- s1g = organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace(']',
- '').replace(
- '.com', '')
- s1 = s1g.upper()
- s2 = match.upper()
- from difflib import SequenceMatcher
- print(s1)
- print(s2)
- print(SequenceMatcher(None, s1, s2).ratio())
- if SequenceMatcher(None, s1, s2).ratio() >= 0.10:
- # and SequenceMatcher(None, s1, s2).ratio()<0.50:
- final.append(
- "OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace('[',
- '').replace(
- ']', '').replace(
- '.com', '').replace(']', ''))
- else:
- final.append("OrganizationName--" + s2)
-
- except IndexError:
- org_name()
- organisation()
-
- # final.append("OrganizationName--")
-
- # ************************************* CONTACT PERSON *******************************************************************
- try:
- final.append(
- "CONTACTPERSONNAME--" + PErsons[0].replace(":PER", "").replace("[", "").replace('"', '').replace("]",
- "") +
- PErsons[
- 1].replace(":PER", "").replace('"', ''))
- except IndexError:
- try:
- final.append(
- "CONTACTPERSONNAME--" + PErsons[0].replace(":PER", "").replace("[", "").replace("]", "").replace(
- '"',
- ''))
- except IndexError:
- org_name()
- contactpersonname()
- # final.append("CONTACTPERSONNAME--")
- ###############address flair#####################
-
- try:
- print(
- '############################################################# address new code #############################################################')
- loactionlst = ['address', 'factory', 'd.no', 'h.no', 'h. no', 'plot', 'flat', 'plat']
- loclst = [i for i in loactionlst if i in htext.lower()]
-
- textaddress = htext
- textaddress = textaddress.replace("|", ",")
- textaddress = textaddress.lower()
-
- nlp = pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy="simple")
- grop = nlp(textaddress)
-
- citycountry = []
- print('########################### city or country name ###########################')
- d = grop[-1]
-
- if d['entity_group'] == "COUNTRY":
- print(d["word"])
- citycountry.append(d["word"])
- elif d['entity_group'] == "CITY":
- print(d["word"])
- citycountry.append(d["word"])
-
- try:
- address1 = loclst[0]
- except IndexError:
- address1 = (textaddress.partition(",")[0])
- words = address1.split()
- address1 = words[-1]
-
- star_location = address1.lower()
- end_location = citycountry[0].replace("#", "")
- start = star_location
- end = end_location
- s = textaddress.lower()
- middle_address = (s.split(start))[-1].split(end)[0]
- Address = start + middle_address + end
- Address = Address.replace('--', '').title()
- print(Address)
- if Address.count(',') < 2:
- splitaddress()
- else:
- final.append('ADDRESS--' + Address)
-
- # star_location = location[0].replace(":LOC", "").replace('"', '').replace('[', '')
- # end_location = location[-1].replace(":LOC", "").replace('"', '').replace(']', '')
- # d1 = star_location.split()
- # d2 = end_location.split()
- # d3 = d1[0]
- # d4 = d2[0]
- # start = d3
- # end = d4
- # s = horizontaltext
- # middle_address = ((s.split(start))[1].split(end)[0])
- # Address = d3 + middle_address + d4
- # final.append('ADDRESS--' + Address)
- # addrespinlst.append(Address)
-
-
- except IndexError:
- splitaddress()
-
- ########################################## Designation ###########################################
- import re
- new = []
- with open('test.txt', 'r') as f:
- flag = False
- for line in f:
- line1 = line
- line = line.upper()
- matches = re.findall(
- r'''\bAPPRENTICE\b|\bEXECUTIVE\b|\bPROPRIETOR\b|\bPARTNER\b|\bMD\b|\bANALYST\b|\bPRACTITIONER\b|\bCUSTOMER\b|\bCOO\b|\bCOACH\b|\bADMINISTRATIVE\b|\bADMINISTRATOR\b|\bAGENT\b|\bHEAD\b|\bCHIEF\b|\bDIRECTOR\b|\bVICE\b|\bPRESIDENT\b|\bMANAGER\b|\bCOORDINATOR\b|\bCOUNSELOR\b|\bSUPERVISOR\b|\bASSISTANT\b|\bSPECIALIST\b|\bARTIST\b|\bWORKER\b|\bCONSULTANT\b|\bREPRESENTATIVE\b|\bARCHITECT\b|\bSTAFF\b|\bMEMBER\b|\bDEVELOPER\b|\bENGINEER\b|\bEXAMINOR\b|\bDOCTOR\b|\bPROFESSOR\b|\bTEACHER\b|\bLEAD\b|\bOFFICER\b|\bCEO\b|\bC.E.O\b|\bJUNIOR\b|\bSENIOR\b|\bPROFESSOR\b|\bSALES\b''',
- line)
- for match in matches:
- line = line.replace('-', '')
- # print(line)
- o = "Designation--" + line
- new.append(o)
- remove_list.append(str(line1).replace('\n', ''))
-
- try:
- a = new[0].replace('\n', '')
- final.append(a)
-
- except IndexError:
- final.append("Designation--")
-
- ###################################################Phone number#################################################
- num = []
- import phonenumbers
-
- # print(verticaltext)
- numbers = phonenumbers.PhoneNumberMatcher(
- verticaltext.replace('+91', '').replace('(0)', '').replace('(', '').replace(')', ''), "IN")
-
- for number in numbers:
- number = str(number).split(")")
- num.append(number[1])
- # num.append(number[-1])
- if len(num) == 0:
- final.append("ContactNumber--")
- final.append("OrganizationNumber--")
- elif len(num) > 1:
- final.append("ContactNumber--" + num[0].replace(' ', ''))
- final.append("OrganizationNumber--" + num[-1].replace(' ', ''))
- elif len(num) == 1:
- try:
- final.append("ContactNumber--" + num[0].replace(' ', ''))
- final.append("OrganizationNumber--")
- except IndexError:
- final.append("ContactNumber--")
- final.append("OrganizationNumber--")
- print(
- '############################################################# num #############################################################')
- print(num)
- # try:
- # final.append("PhoneNumber--" + num[0].replace(' ', ''))
- # remove_list.append(num[0])
- # except IndexError:
- # pass
- # try:
- # final.append("PhoneNumber1--" + num[1].replace(' ', ''))
- # remove_list.append(num[1])
- # except IndexError:
- # pass
- # try:
- # final.append("PhoneNumber2--" + num[2].replace(' ', ''))
- # remove_list.append(num[2])
- # except IndexError:
- # pass
-
- ################################################### Email######################################################
- import re
- from email_scraper import scrape_emails
- s = list(scrape_emails(horizontaltext))
- email_id = s
-
- # email_id = []
- # matches = re.findall(r'[\w\.-]+@[\w\.-]+', verticaltext)
- # for match in matches:
- # email_id.append(match)
-
- # # final.append('Email--' + match)
- # email_ = str(email_id).replace("[", "").replace("]", "").replace("'", "")
- # # final.append(email_)
-
- # # final.append('Email--' + email_)
- # # remove_list.append(email_)
- if len(email_id) > 1:
- final.append(
- 'ContactEmail--' + str(email_id[0]).replace("[", "").replace("]", "").replace("\\n", "").replace("'",
- ""))
- final.append(
- 'OrganizationEmail--' + str(email_id[-1]).replace("[", "").replace("]", "").replace("\\n", "").replace(
- "'",
- ""))
- else:
- try:
- final.append(
- 'ContactEmail--' + str(email_id[0]).replace("[", "").replace("]", "").replace("\\n", "").replace(
- "'",
- ""))
- final.append('OrganizationEmail--')
- except IndexError:
- final.append('ContactEmail--')
- final.append('OrganizationEmail--')
-
- ###############PINCODE############
-
- pinlst = []
- print(addrespinlst)
- import pgeocode
-
- # try:
- # matche1 = re.findall(r'-\d{6}|\b\d{6}\b|\b\d{3} \d{3}\b|-\d{3} \d{3}|\b\d{3} \d{3}\b', addrespinlst[0])
- # for i in matche1:
- # address3 = i.replace(' ', '').replace('-', '')
- # pinlst.append(address3)
- # except IndexError:
-
- lst = []
- for i in num:
- i = i[1:]
- lst.append(i)
-
- infile = r"vtext.txt"
- outfile = r"cleaned_file.txt"
- import glob
- delete_list = lst
- # delete_list = ["firstname1 lastname1","firstname2 lastname2","firstnamen lastnamen",'Director - Sales & Business Development']
- fin = open(infile, "r+")
- fout = open(outfile, "w+")
- for line12 in fin:
- for word in delete_list:
- line12 = line12.replace(word, "")
-
- fout.write(line12)
- fin.close()
- # print(line)
-
- # print(addrespinlst)
- import pgeocode
- #print(line12)
- import re
- matche1 = re.findall(r'-\d{6}|\b\d{6}\b|\b\d{3} \d{3}\b|-\d{3} \d{3}|\b\d{3} \d{3}\b', line12)
- for i in matche1:
- address3 = i.replace(' ', '').replace('-', '')
- pinlst.append(address3)
-
- nomi = pgeocode.Nominatim('IN')
- try:
- a = nomi.query_postal_code(str(pinlst[-1]))
- # print(a)
- b = a.keys()
- c = b.values.tolist()
- d = a.tolist()
- postal_code = "PinCode1" + "--" + d[0]
- final.append(postal_code)
- country_code = c[1] + "--" + str(d[1])
- final.append(country_code)
- place_name = 'LandMark1' + "--" + str(d[2])
- final.append(place_name)
- state_name = c[3] + "--" + str(d[3])
- final.append(state_name)
- state_code = c[4] + "--" + str(d[4])
- final.append(state_code)
- county_name = 'CityName1' + "--" + str(d[5])
- final.append(county_name)
-
- except (IndexError, NameError):
- final.append("PinCode1--")
- final.append("country_code--")
- final.append("LandMark1--")
- final.append("state_name--")
- final.append("state_code--")
- final.append("CityName1--")
-
- ######################################################## json #####################################################################
-
- import pandas as pd
- df = pd.DataFrame(final)
- df1 = df[0].str.split('--', expand=True)
- # print(df1)
- df1.rename({df1.columns[-2]: 'Keys'}, axis=1, inplace=True)
- df1.rename({df1.columns[-1]: 'Values'}, axis=1, inplace=True)
- df1['Keys'] = df1['Keys'].str.strip()
- df1.to_csv('path123.csv', index=False)
- df2 = pd.read_csv('path123.csv')
- print(df2)
- df2 = df2.T
- df2.to_csv('path1.csv', index=False, header=False)
- df1 = pd.read_csv('path1.csv')
- df1.to_json('firstjson1.json', orient="index")
- import json
- with open('firstjson1.json', 'r') as json_file:
- json_load = json.load(json_file)
- # # url = "https://test.bizgaze.app:8443/apis/v4/bizgaze/integrations/businesscards/create"
- nothing = json.dumps(json_load).replace("]", "").replace("[", "").replace('{"0":', '').replace('}}', '}')
- # # print('--------------------------------------------------------------------------')
- # # print(nothing)
- empty = []
- import base64
- name = found
- image = open(name, 'rb')
- image_read = image.read()
- image_64_encode = base64.b64encode(image_read)
- NULL = 'null'
- empty.append("ByteData--" + (NULL).strip('""'))
- image_64_encode = image_64_encode.decode('utf-8')
- empty.append("FileData--" + str(image_64_encode))
- imagedata = name.split("/")
- imagename = str(imagedata[-1]).replace('"', '').replace("[", "").replace("]", "")
- imagename1 = str(imagename).split('.')
- imagename = str(imagename1[-2]).replace("[", "]")
- empty.append("FileName--" + imagename)
- empty.append("FilePath--"+ "")
- imageExtension = str(imagename1[-1]).replace("[", "]")
- empty.append("FileType--" + imageExtension)
- image.close()
- import pandas as pd
- df = pd.DataFrame(empty)
- df = df[0].str.split("--", expand=True)
- data1 = pd.DataFrame(df[0])
- data2 = pd.DataFrame(df[1])
- dt = data2.set_index(data1[0])
- dt4 = dt.T
- dictionary = dt4.to_dict(orient="index")
- list1 = []
- # list.append(a)
- list1.append(dictionary[1])
- # # final.append("image--"+str(dictionary[1]).replace("\'",'"'))
- print('--------------------')
- # print(namelist)
- import json
- # JSON data:
- x = nothing
- # python object to be appended
- y = {"image": dictionary[1]}
- # parsing JSON string:
- z = json.loads(x)
- # appending the data
- z.update(y)
- # the result is a JSON string:
- # print(json.dumps(z))
-
- zlist.append(z)
- #############################################creating csv#####################################
- #print(final)
- #print(imagelist)
- #final.append('image--' + str(imagelist))
- # import requests
- # import json
-
- # # url = "https://anwi.bizgaze.app/apis/v4/bizgaze/integrations/businesscards/create/list" #dev
- # url = "https://qa.bizgaze.com/apis/v4/bizgaze/integrations/businesscards/create/list" #testing
- # # url = "https://test.bizgaze.app/apis/v4/bizgaze/integrations/businesscards/create" # test
- # # url='http://localhost:3088/apis/v4/bizgaze/integrations/businesscards/create'
- # # url = 'https://c01.bizgaze.app/apis/v4/bizgaze/integrations/businesscards/create' # C01
- # payload1 = json.dumps(zlist)
- # # print('--------------------------------------------------------------------------')
- # #print(payload1)
- # headers = {
- # #'Authorization': 'stat 1a936137490040c997928f485e3cdd7a', #dev
- # # 'Authorization': 'stat 16516391d0074f4c8a15ea16fb49470b',#testing
- # # 'Authorization': 'stat 08e55fcfbaa940c8ab8145a074c444d1',
- # # 'Authorization': 'stat f7cdb402e01e44e5842878653946168f', # c01
- # #'Authorization': 'Stat c3e11b2fcbfe455b86a1fe6efde02a69',#demo
- # 'Authorization':'Stat e5bc6ad08f2c42feb5f98a2a521d00af',
-
-
- # 'Content-Type': 'application/json'
- # }
- # response = requests.request("POST", url, headers=headers, data=payload1)
- # # print("##############################################################")
-
- # print(payload1)
- # #print(zlist)
- # # import os
- # # if 'BusinessCards Created Successfully' in response.text:
- # # print('present')
- # # os.remove(found)
- # # else:
- # # print('not present')
-
- # df1.to_json('visitingcard.json')
- # data = df1.to_json('visiting.json', orient='records')
- # print(data)
-
- #return render_template('index.html')
-
-
- #return response.text
- #return z
- return zlist
-
-
-
-
- # Script entry point: start the Flask development server, binding to all
- # interfaces (0.0.0.0) on port 1112.
- # NOTE(review): every line in this chunk carries a leading "-" marker —
- # presumably this is a diff/patch fragment of the original script; confirm
- # before applying code-level changes.
- # NOTE(review): 0.0.0.0 exposes the dev server to the network; Flask's dev
- # server is not intended for production — verify deployment context.
- if __name__ == "__main__":
-     app.run(host='0.0.0.0', port=1112)
|