12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207 |
- from flask import Flask, render_template, request, redirect, Response, send_file
- import os
- import requests
- import pandas as pd
- import pgeocode
- from email_scraper import scrape_emails
- import phonenumbers
- from pdfminer.high_level import extract_text
- import pytesseract
- import time
- import multiprocessing
- from PIL import Image
- from functools import partial
- from urlextract import URLExtract
- import pytesseract as tess
- from PIL import Image
- # from doctr.io import DocumentFile
- # from doctr.models import ocr_predictor
- # model = ocr_predictor(pretrained=True)
- # load tagger
-
- # import spacy
-
- # nlp_model1 = spacy.load('./ADD300_new3.0')
- from flair.data import Sentence
- from flair.models import SequenceTagger
- from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
-
# City/country token-classification model used later for address extraction.
tokenizer = AutoTokenizer.from_pretrained("ml6team/bert-base-uncased-city-country-ner")
model = AutoModelForTokenClassification.from_pretrained("ml6team/bert-base-uncased-city-country-ner")

from paddleocr import PaddleOCR, draw_ocr

# OCR engine for card images: English, angle classification on, spaces kept.
# NOTE: all of these model loads run at import time and are slow/memory-heavy.
ocr = PaddleOCR(use_angle_cls=True, lang='en', use_space_char=True, show_log=True)
# General English NER tagger (PER/ORG/LOC entities) shared by the handlers.
tagger = SequenceTagger.load("flair/ner-english-large")

import datetime

app = Flask(__name__)
-
-
- # app.config["IMAGE_UPLOADS"] = "C:/inetpub/wwwroot/FlaskApplication/Flask_Demo/upload/"
-
-
@app.route('/', methods=['GET'])
def resume():
    """Serve the landing page (upload form)."""
    landing_page = 'index.html'
    return render_template(landing_page)
-
-
- #@app.route('/upload_BusinessCards', methods=["POST"])
- def predict(Dataset):
- print('################## single card detection #######################')
- starttime = datetime.datetime.now()
- print('Execution Started at:', starttime)
-
- # print(Dataset)
- import os
-
- # if request.method == "POST":
-
- # if request.files:
-
- # image = request.files["image"]
- # try:
- # image.save(os.path.join(
- # app.config["IMAGE_UPLOADS"], image.filename))
- # except IsADirectoryError:
- # return render_template('card.html')
- # # image.save(os.path.join(
- # # app.config["IMAGE_UPLOADS"], image.filename))
-
- # print("Image saved")
-
- # return redirect(request.url)
- #url_list = request.get_json()
- # print(Dataset)
- # print(url_list)
- #dataset = request.get_json()
- # print(data)
- # data = {'visiting': Dataset}
- a=Dataset[0]
- #a = url_list
- # print(a)
- x = a['FileData']
- # print(x)
- y = a['FileName']
- z = a['FileType']
- # CreatedBy=a['CreatedBy']
-
- name = y + '.' + z
- # print(name)
- # print(y)
- # image = y.split("/")
- # filename=image[-1]
-
- # print(x)
- img_data = x.encode()
-
- import base64
- with open('./upload/' + name, "wb") as fh:
- fh.write(base64.decodebytes(img_data))
- import re
- import csv
- import glob
- import os
- # import pytesseract
- # import cv2
- import numpy as np
-
- import glob
- import os
- import cv2
- import requests
-
- final = []
- # final.append('assignto--'+CreatedBy)
- imagelist = []
-
- # print(found)
-
- remove_list = []
- import os
- import glob
- import pdfminer
-
- # import os
- # ts = 0
- # for file_name in glob.glob('./upload/*'):
- # fts = os.path.getmtime(file_name)
- # if fts > ts:
- # ts = fts
- # found = file_name
- found = './upload/' + name
- print(found)
- extension = found.split('.')[-1]
-
- # print(extension)
-
    def org_name():
        """Re-OCR the uploaded file and re-run flair NER over the result.

        NOTE(review): `organizations`, `PErsons` and `location` computed at the
        bottom are plain locals — nothing is returned or declared nonlocal, so
        callers (organisation()/contactpersonname()) cannot see these results;
        they keep reading the enclosing predict()-level lists. Confirm intent.
        """
        print('org_name is working')
        import pytesseract
        fname = found
        if extension != 'pdf':
            # Grayscale the image in place, re-save at high DPI, then have
            # Tesseract emit a searchable PDF that pdfminer can read back.
            img = cv2.imread(fname)
            img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

            cv2.imwrite(str(found), img)
            from PIL import Image
            im = Image.open(found)
            im.save("images1.png", dpi=(1200, 1200))
            fname = "images1.png"
            import pytesseract as tess
            from PIL import Image

            # Hard-coded Windows install path of the Tesseract binary.
            tess.pytesseract.tesseract_cmd=r"C:\Program Files (x86)\Tesseract-OCR\tesseract.exe"
            pdf = tess.image_to_pdf_or_hocr(fname, extension="pdf")
            with open("demo.pdf","w+b",) as f:
                f.write(pdf)

            from pdfminer.high_level import extract_text
            text = extract_text('demo.pdf')
        else:
            # PDFs already carry a text layer; extract it directly.
            from pdfminer.high_level import extract_text
            text = extract_text(fname)

        sentence = Sentence(text)

        # predict NER tags
        tagger.predict(sentence)

        # print sentence
        ko = (sentence)

        # flair renders tagged text as "token → LABEL"; keep the tagged half.
        ko1 = str(ko).split("→")
        import pandas as pd

        dfg = []
        try:
            # NOTE(review): the first two .replace targets render as empty
            # strings here — presumably non-ASCII bracket/arrow characters
            # lost in transit; confirm against the original file.
            s = ko1[1].replace("", "").replace("", "").replace("/", ":")

            dfg.append(s)
            df = pd.DataFrame(dfg)
            df = df[0]

            # Round-trip through CSV, then split the single row on commas to
            # get one "token:TAG" string per entity.
            df.to_csv("df.csv", index=False)

            df1 = pd.read_csv("df.csv")
            ve = df1["0"].str.split(",")
            fgf = ve.to_list()
            dfgh = pd.DataFrame(fgf[0])
            maindf = dfgh[0]

            main1 = maindf.to_list()
            main1
            # Bucket entity strings by tag type (substring match on the tag).
            per = ["PER"]
            org = ["ORG"]
            loc = ["LOC"]
            organizations = [i for i in main1 for j in org if j in i]
            PErsons = [i for i in main1 for j in per if j in i]
            location = [i for i in main1 for j in loc if j in i]
        except IndexError:
            # No "→" separator in the render: the tagger found no entities.
            pass
-
- # ************************************* ORGANIZATION ********************************************************************
-
    def organisation():
        """Choose the organisation name by fuzzy-matching flair ORG entities
        against the detected website domain.

        Fallback chain: two ORG entities -> one ORG entity -> bare URL stem ->
        keyword scan in company(). Appends one "OrganizationName--..." entry
        to the closure list `final` on every path except when the candidate
        string is effectively empty.
        """
        print('organisation working ')
        try:
            # Candidate built from the first two ORG entities; skip when the
            # cleaned result is shorter than 4 chars (effectively empty).
            if len(("OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace('.com', '').replace('.in', '') + " /" + organizations[1].replace(":ORG", "").replace('"', '').replace('.com', ''))) < 4:
                pass
            else:
                # Normalise the first URL down to a bare uppercased domain stem.
                match = str(urlfinal[0]).lower()
                match = match.replace('.com', '').replace('www.', '').replace('.in', '').replace('.co', '').replace('https', '').replace('http', '').replace(":", "").replace("/", "").upper()
                print(match)

                s1g = organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace('.com', '') + " /" + organizations[1].replace(":ORG", "").replace('"', '').replace('.com', '')
                s1 = s1g.upper()
                s2 = match.upper()
                from difflib import SequenceMatcher
                print(s1)
                print(s2)
                print(SequenceMatcher(None, s1, s2).ratio())
                # Very low bar (10% similarity): prefer the NER entities when
                # they resemble the domain at all, otherwise trust the domain.
                if SequenceMatcher(None, s1, s2).ratio() >= 0.10:
                    final.append("OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace('.com', '').replace(']', '') + " /" + organizations[1].replace(":ORG", "").replace('"', '').replace('.com', '').replace(']', ''))
                else:
                    final.append("OrganizationName--" + s2)

        except IndexError:
            # Fewer than two ORG entities: retry with just the first one.
            try:
                if len(("OrganizationName--" + organizations[0].replace(":ORG", "").replace('[', '').replace(']', '').replace('"', '').replace('.com', '').replace('.in', ''))) < 4:
                    pass
                else:
                    match = str(urlfinal[0]).lower()
                    match = match.replace('.com', '').replace('www.', '').replace('.in', '').replace('.co', '').replace('https', '').replace('http', '').replace(":", "").replace("/", "").upper()

                    s1g = organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace('.com', '')
                    s1 = s1g.upper()
                    s2 = match.upper()
                    from difflib import SequenceMatcher
                    print(s1)
                    print(s2)
                    print(SequenceMatcher(None, s1, s2).ratio())
                    if SequenceMatcher(None, s1, s2).ratio() >= 0.10:
                        final.append("OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace(']', '').replace('.com', ''))
                    else:
                        final.append("OrganizationName--" + s2)

            except IndexError:
                # No ORG entities at all: fall back to the URL stem, and
                # finally to the keyword-based company() scan.
                try:
                    match = str(urlfinal[0]).lower()
                    match = match.replace('.com', '').replace('www.', '').replace('.in', '').replace('.co', '').upper()
                    final.append("OrganizationName--" + match)
                except IndexError:
                    company()
-
- #################################################company Name########################################
-
    def company():
        """Last-resort organisation detection: scan the OCR dump (test.txt)
        for lines containing company-type keywords.

        Appends the first matching line to `final` as "OrganizationName--...",
        or an empty marker when nothing matches.
        """
        print('company list working')
        import re

        new = []
        with open('test.txt', 'r+') as f:
            flag = False
            for line in f:
                line = line.upper()
                matches = re.findall(
                    r'''\bENTERPRISE\b|\bTRADE\b|\bEMPIRE\b|\bSTORES\b|\bMACHINERY\b|\bINDUSTRIES\b|\bTECHNOLOGY\b|\bCOMPANY\b|\bDESIGNERS\b|\bPOLYMER\b|\bBELT\b|\bAGRO\b|\bPLASTIC\b|\bGROUP\b|\bTOOLS\b|\bENGG.\b|\bSOLUTION\b|\bCONSTRUCTION\b|\bPACK\b|\bELECT\b|\bSTEEL\b|\bIRON\b|\bDIES\b|\bMOULD\b|\bCORPORATION\b|\bSEEDS\b|\bPOWER\b|\bCONSULTANT\b|\bMFG.\b|\bPRINT\b|\bFOOD\b|\bSOLAR\b|\bINDUSTRY\b|\bLIMITED\b|\bPRIVATE\b|\bPVT\b|\bLTD\b|\bOUTSOURCING\b|\bCNC\b|\bMACHINERIES\b|\bSOLUTIONS\b|\bENGINEERS\b|\bWORKS\b|\bPRODUCTS\b|\bENTERPRISES\b|\bCOMPANIES\b|\bPOLYMERS\b|\bTRADING\b''',
                    line)

                for i in matches:
                    if i in line:
                        flag = True
                # NOTE(review): `flag` is never reset, so every line from the
                # first keyword hit onward is collected; only new[0] is used
                # below, so the net effect is "first keyword line wins".
                if flag:
                    o = "OrganizationName--" + line
                    new.append(o)
        try:
            a = new[0].replace('\n', '')
            final.append(a)
        except IndexError:
            final.append("OrganizationName--")
-
- # ************************************* CONTACT PERSON *******************************************************************
- def contactpersonname():
- print('contactpersonname working')
- try:
- final.append(
- "CONTACTPERSONNAME--" + PErsons[0].replace(":PER", "").replace("[", "").replace('"', '').replace("]",
- "") + '/' +
- PErsons[
- 1].replace(":PER", "").replace('"', ''))
- except IndexError:
- try:
- final.append(
- "CONTACTPERSONNAME--" + PErsons[0].replace(":PER", "").replace("[", "").replace("]", "").replace(
- '"', ''))
- except IndexError:
- final.append("CONTACTPERSONNAME--")
-
- def image_to_text():
-
- # doc = DocumentFile.from_images(found)
- # result = model(doc)
- # image_to_text.txt = result.render()
-
- # tess.pytesseract.tesseract_cmd = r"C:\\Program Files (x86)\\Tesseract-OCR\\tesseract.exe"
- # img = Image.open(found)
- # text = tess.image_to_string(img)
- # image_to_text.txt = text
- # print(text)
- import cv2
- img_path = found
- img = cv2.imread(img_path)
- img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
- cv2.imwrite(str(found), img)
-
- result = ocr.ocr(img_path, cls=True)
- result = result[0]
-
- txts = [line[1][0] for line in result]
-
- image_to_text.txt = ""
- for i in txts:
- if len(i) < 4:
- continue
- # print(i+"\n")
- image_to_text.txt = image_to_text.txt + str(i) + "\n"
- # print(image_to_text.txt)
-
- def pdf_to_text():
-
- from pdfminer.high_level import extract_text
- pdf_to_text.txt = extract_text(found)
- # pdf_to_text.txt= text.replace('\n', ' ')
-
- extensionlist = ['JPEG', 'jpg', 'png', 'JPG', 'PNG', 'jpeg']
-
- if extension in extensionlist:
- print('image' + extension)
- image_to_text()
- x = image_to_text.txt
-
- else:
- print('pdf' + extension)
- pdf_to_text()
- x = pdf_to_text.txt
-
- verticaltext = x
- htext = x
- # print('------------------------------------------------')
- print(
- '############################################################# this is verticaltext #################################################################')
- print(verticaltext)
- htext = htext.replace('\n', ' ')
- print(
- '############################################################# this is htext #############################################################')
- print(htext)
- y = x.replace('\n', ',')
- y = y.replace(' ', ' ')
- # y = y.replace(".", " .")
- horizontaltext = y
- # print('------------------------------------------------')
- print(
- '############################################################# this is horizontaltext #############################################################')
- print(horizontaltext)
-
- textfile = open("test123456.txt", "w")
- a = textfile.write(verticaltext)
- textfile.close()
- textfile = open("vtext.txt", "w")
- a = textfile.write(horizontaltext)
- textfile.close()
- with open('test123456.txt', 'r') as f:
- with open('test.txt', 'w') as w:
- for line in f:
- if line.strip().replace('|', ''):
- w.write(line)
-
- ###########################ADDRESS##################################
- addrespinlst = []
-
    def splitaddress():
        """Regex fallback for address extraction.

        Builds Address = <last word before the first comma> + <text leading up
        to an Indian-PIN-like number> + <the PIN itself> and appends it to
        `final`/`addrespinlst`. When no PIN-like pattern matches, address2 and
        address3 are never assigned and the NameError branch emits nothing.
        """
        import re
        textaddress = htext.replace('\n', ' ')

        # address1: last word of the text preceding the first comma.
        address1 = (textaddress.partition(",")[0])
        words = address1.split()
        address1 = words[-1]
        addre = (htext.partition(",")[2])
        a = addre.replace('\n', ' ').replace('\x0c', '')
        addre = (a.partition(",")[2])
        # address2: text up to a PIN-like number (6 digits, "ddd ddd", or
        # "-dd" variants); the loop keeps only the last match.
        matches = re.findall(
            r'(.*?)-\d{3} \d{3}|(.*?)\b-\d{6}\b|(.*?)\b\d{6}\b|(.*?)\b\d{3} \d{3}\b|\b(.*?)-\d{2}\b|(.*?)\b\d{3} \d{3}\b',
            a)
        for match in matches:
            address2 = match
            address2 = str(address2)
            address2 = address2.replace("'", "").replace("(", "").replace(")", "").replace(', ,', '').replace(' ', '')

        # address3: the PIN code itself; again, last match wins.
        matches = re.findall(r'-\d{6}|\b\d{6}\b|\b\d{3} \d{3}\b|\b-\d{2}\b|-\d{3} \d{3}|\b\d{3} \d{3}\b', a)
        for address3 in matches:
            pass
        try:
            Address = address1 + "," + address2 + "," + address3
            final.append('ADDRESS--' + Address)
            addrespinlst.append(Address)

        except NameError:
            # address2/address3 unassigned: no PIN-like pattern was found.
            # A spaCy-based address model used to run here (commented out in
            # history); currently this path intentionally appends nothing.
            print(
                '############################################################ Addressmodelworking #############################################################')
            pass
- ################################################## website#######################################################
-
- # import re
-
- # url = []
- # matches = re.findall(r'www.*', verticaltext)
- # for match in matches:
- # if (match.count('.')) == 1:
- # a_string1 = match.replace("www", "www.")
-
- # final.append("Urls--" + a_string1)
- # url.append(a_string1)
- # else:
-
- # final.append("Urls--" + match)
-
- # if len(url)==0:
-
- # from urlextract import URLExtract
-
- # extractor = URLExtract()
- # urls = extractor.find_urls(verticaltext)
- # try:
- # urllist = urls[0]
- # final.append("Urls--"+urllist)
- # url.append(urllist)
- # except IndexError:
- # final.append("Urls--")
-
- # for match in matches:
- # if (match.count('.')) == 1:
- # a_string1 = match.replace("www", "www.")
-
- # final.append("Urls--" + a_string1)
- # url.append(a_string1)
- # else:
-
- # final.append("Urls--" + match)
- # url.append(match)
- # remove_list.append(match)
- # else:
- # final.append("Urls--" )
-
- ################################################## website#######################################################
-
- import re
- # final=[]
- url = []
- urlfinal = []
- matches = re.findall(r'www.*', verticaltext)
- for match in matches:
-
- if (match.count('.')) == 1:
- a_string1 = match.replace("www", "www.")
-
- # final.append("Urls--" + a_string1)
- url.append(a_string1)
- else:
-
- url.append(match)
-
- if len(url) == 0:
-
- from urlextract import URLExtract
-
- extractor = URLExtract()
- urls = extractor.find_urls(verticaltext)
- try:
- urllist = urls[0]
- url.append(urllist)
- url.append(urllist)
- except IndexError:
- pass
-
- for match in matches:
- if (match.count('.')) == 1:
- a_string1 = match.replace("www", "www.")
-
- url.append(a_string1)
- # url.append(a_string1)
- else:
-
- url.append(match)
- url.append(match)
-
- else:
- pass
- try:
- test_string = url[0]
-
- test_list = ['com', 'www', 'in', 'co', "WWW", "COM", "CO", "IN"]
-
- res = [ele for ele in test_list if (ele in test_string)]
-
- if len(res) == 0:
- print('no match')
-
- final.append('urls--')
-
-
- else:
- print('matched')
- final.append('urls--' + url[0])
- urlfinal.append(url[0])
-
-
- except IndexError:
- final.append('urls--')
-
- print(
- '############################################################# url #############################################################')
- print(url)
- #######organisation and contact################
-
- # def company_url():
- # # print('--url--')
- # # print(url)
-
- # try:
- # match = str(url[0]).lower()
- # match =match.replace('.com','').replace('www.','').replace('.in','').replace('.co','').upper()
- # final.append("OrganizationName--" + match)
- # # remove_list.append(match)
- # except IndexError:
- # org_name()
- # organisation()
- # final.append("OrganizationName--")
-
- # make example sentence
-
- # print(horizontaltext)
- sentence = Sentence(verticaltext)
-
- # predict NER tags
- tagger.predict(sentence)
-
- # print sentence
- ko = (sentence)
-
- ko1 = str(ko).split("→")
- import pandas as pd
-
- dfg = []
- try:
- s = ko1[1].replace("", "").replace("", "").replace("/", ":")
- except IndexError:
- os.remove(found)
- return 'Invalid image'
- dfg.append(s)
- df = pd.DataFrame(dfg)
- df = df[0]
-
- df.to_csv("df.csv", index=False)
-
- df1 = pd.read_csv("df.csv")
- ve = df1["0"].str.split(",")
- fgf = ve.to_list()
- dfgh = pd.DataFrame(fgf[0])
- maindf = dfgh[0] # .str.split(":")
- # maindf.to_csv("main.csv")
-
- main1 = maindf.to_list()
- main1
- # cv=pd.DataFrame(ve)
- # cv
- per = ["PER"]
- org = ["ORG"]
- loc = ["LOC"]
- organizations = [i for i in main1 for j in org if j in i]
- PErsons = [i for i in main1 for j in per if j in i]
- location = [i for i in main1 for j in loc if j in i]
-
- # ************************************* ORGANIZATION ********************************************************************
- try:
- if len(("OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace(
- ']', '').replace(
- '.com', '') + " /" + organizations[1].replace(":ORG", "").replace('"', '').replace('.com', ''))) < 4:
- pass
- # company_url()
- else:
-
- match = str(urlfinal[0]).lower()
- match = match.replace('.com', '').replace('www.', '').replace('.in', '').replace('.co', '').replace('https',
- '').replace(
- 'http', '').replace(":", "").replace("/", "").upper()
- print(match)
-
- s1g = organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace(']', '').replace(
- '.com', '') + " /" + \
- organizations[1].replace(":ORG", "").replace('"', '').replace('.com', '')
- s1 = s1g.upper()
- s2 = match.upper()
- from difflib import SequenceMatcher
- print(s1)
- print(s2)
- print(SequenceMatcher(None, s1, s2).ratio())
- if SequenceMatcher(None, s1, s2).ratio() >= 0.10:
- # and SequenceMatcher(None, s1, s2).ratio()<0.50:
- final.append("OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace('[',
- '').replace(
- '.com', '').replace(']', '') + " /" + organizations[1].replace(":ORG", "").replace('"', '').replace(
- '.com', '').replace(']', ''))
- else:
- final.append("OrganizationName--" + s2)
-
-
-
- except IndexError:
- try:
- if len(("OrganizationName--" + organizations[0].replace(":ORG", "").replace('[', '').replace(']',
- '').replace(
- '"',
- '').replace(
- '.com', ''))) < 4:
- pass
- # company_url()
- else:
-
- match = str(urlfinal[0]).lower()
- match = match.replace('.com', '').replace('www.', '').replace('.in', '').replace('.co', '').replace(
- 'https', '').replace('http', '').replace(":", "").replace("/", "").upper()
-
- s1g = organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace(']', '').replace(
- '.com', '')
- s1 = s1g.upper()
- s2 = match.upper()
- from difflib import SequenceMatcher
- print(s1)
- print(s2)
- print(SequenceMatcher(None, s1, s2).ratio())
- if SequenceMatcher(None, s1, s2).ratio() >= 0.10:
- # and SequenceMatcher(None, s1, s2).ratio()<0.50:
- final.append(
- "OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace('[',
- '').replace(
- ']', '').replace(
- '.com', '').replace(']', ''))
- else:
- final.append("OrganizationName--" + s2)
-
- except IndexError:
- org_name()
- organisation()
-
- # final.append("OrganizationName--")
-
- # ************************************* CONTACT PERSON *******************************************************************
- try:
- final.append(
- "CONTACTPERSONNAME--" + PErsons[0].replace(":PER", "").replace("[", "").replace('"', '').replace("]", "") +
- PErsons[
- 1].replace(":PER", "").replace('"', ''))
- except IndexError:
- try:
- final.append(
- "CONTACTPERSONNAME--" + PErsons[0].replace(":PER", "").replace("[", "").replace("]", "").replace('"',
- ''))
- except IndexError:
- org_name()
- contactpersonname()
- # final.append("CONTACTPERSONNAME--")
- ###############address flair#####################
-
- try:
- print(
- '############################################################# address new code #############################################################')
- loactionlst = ['address', 'factory', 'd.no', 'h.no', 'h. no', 'plot', 'flat', 'plat']
- loclst = [i for i in loactionlst if i in htext.lower()]
-
- textaddress = htext
- textaddress = textaddress.replace("|", ",")
- textaddress = textaddress.lower()
-
- nlp = pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy="simple")
- grop = nlp(textaddress)
-
- citycountry = []
- print('########################### city or country name ###########################')
- d = grop[-1]
-
- if d['entity_group'] == "COUNTRY":
- print(d["word"])
- citycountry.append(d["word"])
- elif d['entity_group'] == "CITY":
- print(d["word"])
- citycountry.append(d["word"])
-
- try:
- address1 = loclst[0]
- except IndexError:
- address1 = (textaddress.partition(",")[0])
- words = address1.split()
- address1 = words[-1]
-
- star_location = address1.lower()
- end_location = citycountry[0].replace("#", "")
- start = star_location
- end = end_location
- s = textaddress.lower()
- middle_address = (s.split(start))[-1].split(end)[0]
- Address = start + middle_address + end
- Address = Address.replace('--', '').title()
- print(Address)
- if Address.count(',') < 2:
- splitaddress()
- else:
- final.append('ADDRESS--' + Address)
-
- # star_location = location[0].replace(":LOC", "").replace('"', '').replace('[', '')
- # end_location = location[-1].replace(":LOC", "").replace('"', '').replace(']', '')
- # d1 = star_location.split()
- # d2 = end_location.split()
- # d3 = d1[0]
- # d4 = d2[0]
- # start = d3
- # end = d4
- # s = horizontaltext
- # middle_address = ((s.split(start))[1].split(end)[0])
- # Address = d3 + middle_address + d4
- # final.append('ADDRESS--' + Address)
- # addrespinlst.append(Address)
-
-
- except IndexError:
- splitaddress()
-
- ########################################## Designation ###########################################
- import re
- new = []
- with open('test.txt', 'r') as f:
- flag = False
- for line in f:
- line1 = line
- line = line.upper()
- matches = re.findall(
- r'''\bAPPRENTICE\b|\bEXECUTIVE\b|\bPROPRIETOR\b|\bPARTNER\b|\bMD\b|\bANALYST\b|\bPRACTITIONER\b|\bCUSTOMER\b|\bCOO\b|\bCOACH\b|\bADMINISTRATIVE\b|\bADMINISTRATOR\b|\bAGENT\b|\bHEAD\b|\bCHIEF\b|\bDIRECTOR\b|\bVICE\b|\bPRESIDENT\b|\bMANAGER\b|\bCOORDINATOR\b|\bCOUNSELOR\b|\bSUPERVISOR\b|\bASSISTANT\b|\bSPECIALIST\b|\bARTIST\b|\bWORKER\b|\bCONSULTANT\b|\bREPRESENTATIVE\b|\bARCHITECT\b|\bSTAFF\b|\bMEMBER\b|\bDEVELOPER\b|\bENGINEER\b|\bEXAMINOR\b|\bDOCTOR\b|\bPROFESSOR\b|\bTEACHER\b|\bLEAD\b|\bOFFICER\b|\bCEO\b|\bC.E.O\b|\bJUNIOR\b|\bSENIOR\b|\bPROFESSOR\b|\bSALES\b''',
- line)
- for match in matches:
- line = line.replace('-', '')
- # print(line)
- o = "Designation--" + line
- new.append(o)
- remove_list.append(str(line1).replace('\n', ''))
-
- try:
- a = new[0].replace('\n', '')
- final.append(a)
-
- except IndexError:
- final.append("Designation--")
-
- ###################################################Phone number#################################################
- num = []
- import phonenumbers
-
- # print(verticaltext)
- numbers = phonenumbers.PhoneNumberMatcher(
- verticaltext.replace('+91', '').replace('(0)', '').replace('(', '').replace(')', ''), "IN")
-
- for number in numbers:
- number = str(number).split(")")
- num.append(number[1])
- # num.append(number[-1])
- if len(num) == 0:
- final.append("ContactNumber--")
- final.append("OrganizationNumber--")
- elif len(num) > 1:
- final.append("ContactNumber--" + num[0].replace(' ', ''))
- final.append("OrganizationNumber--" + num[-1].replace(' ', ''))
- elif len(num) == 1:
- try:
- final.append("ContactNumber--" + num[0].replace(' ', ''))
- final.append("OrganizationNumber--")
- except IndexError:
- final.append("ContactNumber--")
- final.append("OrganizationNumber--")
- print(
- '############################################################# num #############################################################')
- print(num)
- # try:
- # final.append("PhoneNumber--" + num[0].replace(' ', ''))
- # remove_list.append(num[0])
- # except IndexError:
- # pass
- # try:
- # final.append("PhoneNumber1--" + num[1].replace(' ', ''))
- # remove_list.append(num[1])
- # except IndexError:
- # pass
- # try:
- # final.append("PhoneNumber2--" + num[2].replace(' ', ''))
- # remove_list.append(num[2])
- # except IndexError:
- # pass
-
- ################################################### Email######################################################
- import re
- from email_scraper import scrape_emails
- s = list(scrape_emails(horizontaltext))
- email_id = s
-
- # email_id = []
- # matches = re.findall(r'[\w\.-]+@[\w\.-]+', verticaltext)
- # for match in matches:
- # email_id.append(match)
-
- # # final.append('Email--' + match)
- # email_ = str(email_id).replace("[", "").replace("]", "").replace("'", "")
- # # final.append(email_)
-
- # # final.append('Email--' + email_)
- # # remove_list.append(email_)
- if len(email_id) > 1:
- final.append(
- 'ContactEmail--' + str(email_id[0]).replace("[", "").replace("]", "").replace("\\n", "").replace("'", ""))
- final.append(
- 'OrganizationEmail--' + str(email_id[-1]).replace("[", "").replace("]", "").replace("\\n", "").replace("'",
- ""))
- else:
- try:
- final.append(
- 'ContactEmail--' + str(email_id[0]).replace("[", "").replace("]", "").replace("\\n", "").replace("'",
- ""))
- final.append('OrganizationEmail--')
- except IndexError:
- final.append('ContactEmail--')
- final.append('OrganizationEmail--')
-
- ###############PINCODE############
-
- pinlst = []
- print(addrespinlst)
- import pgeocode
-
- # try:
- # matche1 = re.findall(r'-\d{6}|\b\d{6}\b|\b\d{3} \d{3}\b|-\d{3} \d{3}|\b\d{3} \d{3}\b', addrespinlst[0])
- # for i in matche1:
- # address3 = i.replace(' ', '').replace('-', '')
- # pinlst.append(address3)
- # except IndexError:
-
- lst = []
- for i in num:
- i = i[1:]
- lst.append(i)
-
- infile = r"vtext.txt"
- outfile = r"cleaned_file.txt"
- import glob
- delete_list = lst
- # delete_list = ["firstname1 lastname1","firstname2 lastname2","firstnamen lastnamen",'Director - Sales & Business Development']
- fin = open(infile, "r+")
- fout = open(outfile, "w+")
- for line12 in fin:
- for word in delete_list:
- line12 = line12.replace(word, "")
-
- fout.write(line12)
- fin.close()
- # print(line)
-
- # print(addrespinlst)
- import pgeocode
- print(line12)
- import re
- matche1 = re.findall(r'-\d{6}|\b\d{6}\b|\b\d{3} \d{3}\b|-\d{3} \d{3}|\b\d{3} \d{3}\b', line12)
- for i in matche1:
- address3 = i.replace(' ', '').replace('-', '')
- pinlst.append(address3)
-
- nomi = pgeocode.Nominatim('IN')
- try:
- a = nomi.query_postal_code(str(pinlst[-1]))
- # print(a)
- b = a.keys()
- c = b.values.tolist()
- d = a.tolist()
- postal_code = "PinCode1" + "--" + d[0]
- final.append(postal_code)
- country_code = c[1] + "--" + str(d[1])
- final.append(country_code)
- place_name = 'LandMark1' + "--" + str(d[2])
- final.append(place_name)
- state_name = c[3] + "--" + str(d[3])
- final.append(state_name)
- state_code = c[4] + "--" + str(d[4])
- final.append(state_code)
- county_name = 'CityName1' + "--" + str(d[5])
- final.append(county_name)
-
- except (IndexError, NameError):
- final.append("PinCode1--")
- final.append("country_code--")
- final.append("LandMark1--")
- final.append("state_name--")
- final.append("state_code--")
- final.append("CityName1--")
-
- ######################################################## json #####################################################################
- import pandas as pd
-
- df = pd.DataFrame(final)
-
- df1 = df[0].str.split('--', expand=True)
- # print(df1)
- df1.rename({df1.columns[-2]: 'Keys'}, axis=1, inplace=True)
- df1.rename({df1.columns[-1]: 'Values'}, axis=1, inplace=True)
- df1['Keys']=df1['Keys'].str.strip()
- df1.to_csv('path12.csv', index=False)
-
- df2 = pd.read_csv('path12.csv')
- print(final)
- print(df2)
- df2 = df2.T
-
- df2.to_csv('path.csv', index=False, header=False)
- df1 = pd.read_csv('path.csv')
- df1.to_json('firstjson.json', orient="index")
-
- import json
-
- with open('firstjson.json', 'r') as json_file:
- json_load = json.load(json_file)
-
- # url = "https://test.bizgaze.app:8443/apis/v4/bizgaze/integrations/businesscards/create"
-
- nothing = json.dumps(json_load).replace("]", "").replace("[", "").replace('{"0":', '').replace('}}', '}')
- # print('--------------------------------------------------------------------------')
- # print(nothing)
-
- empty = []
- import base64
-
- name = found
- image = open(name, 'rb')
- image_read = image.read()
- image_64_encode = base64.b64encode(image_read)
- NULL = 'null'
- empty.append("ByteData--" + (NULL).strip('""'))
- image_64_encode = image_64_encode.decode('utf-8')
- empty.append("FileData--" + str(image_64_encode))
- imagedata = name.split("/")
- imagename = str(imagedata[-1]).replace('"', '').replace("[", "").replace("]", "")
- imagename1 = str(imagename).split('.')
-
- imagename = str(imagename1[-2]).replace("[", "]")
- empty.append("FileName--" + imagename)
- empty.append("FilePath--" + found)
- imageExtension = str(imagename1[-1]).replace("[", "]")
- empty.append("FileType--" + imageExtension)
- image.close()
- import pandas as pd
- df = pd.DataFrame(empty)
- df = df[0].str.split("--", expand=True)
- data1 = pd.DataFrame(df[0])
- data2 = pd.DataFrame(df[1])
- dt = data2.set_index(data1[0])
-
- dt4 = dt.T
-
- dictionary = dt4.to_dict(orient="index")
- list1 = []
-
- # list.append(a)
- list1.append(dictionary[1])
- # # final.append("image--"+str(dictionary[1]).replace("\'",'"'))
- print('--------------------')
- # print(namelist)
- import json
-
- # JSON data:
- x = nothing
-
- # python object to be appended
- y = {"image": dictionary[1]}
-
- # parsing JSON string:
- z = json.loads(x)
-
- # appending the data
- z.update(y)
-
- # the result is a JSON string:
- # print(json.dumps(z))
- # print('##########################')
- # #print(z)
- # print('##########################')
-
- # #############################################creating csv#####################################
- # # print(final)
- # # print(imagelist)
- # final.append('image--' + str(imagelist))
- #
- # import requests
- # import json
-
- # with open('visitingcard1.json', 'r') as json_file:
- # json_load = json.load(json_file)
- # url = "https://demo.bizgaze.app/apis/v4/bizgaze/integrations/businesscards/create"
- url = "https://test.bizgaze.app/apis/v4/bizgaze/integrations/businesscards/create"
- # url = 'https://c01.bizgaze.app/apis/v4/bizgaze/integrations/businesscards/create' # C01
-
- payload1 = json.dumps(z)
- # print('--------------------------------------------------------------------------')
- # print(payload1)
- headers = {
- 'Authorization': 'stat db226c95fae04943aa3e3c03a4381b2a',
- # 'Authorization': 'stat f7cdb402e01e44e5842878653946168f', # c01
- # 'Authorization': 'Stat c3e11b2fcbfe455b86a1fe6efde02a69',#demosss
- 'Content-Type': 'application/json'
- }
- response = requests.request("POST", url, headers=headers, data=payload1)
- print("##############################################################")
-
- # print(payload1)
- print(response.text)
- if 'BusinessCards Created Successfully' in response.text:
- print('present')
- os.remove(found)
- else:
- print('not present')
-
- # df1.to_json('visitingcard.json')
- # data = df1.to_json('visiting.json', orient='records')
- # print(data)
-
- # return render_template('index.html')
-
-
-
- # print('Time Taken:',total)
- endtime = datetime.datetime.now()
- print('Completed at:', endtime)
- print(starttime)
- print(endtime)
- print('--------------------------')
-
- # z=end-start
- # print('Time Taken:',z)
- # return response.text
- # return 'done'
- return response.text
-
- #@app.route('/upload_BusinessCards', methods=["POST"])
- #@app.route('/multiplecards', methods=["POST"])
- def multiplecards(Dataset):
- print('################## multiple card detection #######################')
- #print(Dataset)
- #dataset = request.get_json()
- # print(data)
- data = {'visiting': Dataset}
- for i in data['visiting']:
- import time
- #time.sleep(1)
- a = i
- x = a['FileData']
- # print(x)
- y = a['FileName']
- z = a['FileType']
- # CreatedBy=a['CreatedBy']
-
- name = y + '.' + z
- # print(name)
- # print(y)
- # image = y.split("/")
- # filename=image[-1]
-
- # print(x)
- img_data = x.encode()
-
- import base64
- with open('./multicards/' + name, "wb") as fh:
- fh.write(base64.decodebytes(img_data))
- # print(i)
-
- # import os
- # import glob
- # for i in glob.glob('./multipleupload/*'):
-
- found = './multicards/' + name
- print(found)
- extension = found.split('.')[-1]
-
- # for root, dirs, fils in os.glob('./multipleupload'):
- # for name in files:
- # foundfile= os.path.join(root, name)
- # print(foundfile)
-
- import re
- import csv
- import glob
- import os
- # import pytesseract
- # import cv2
- import numpy as np
- import glob
- import os
- import cv2
- import requests
- final = []
- # final.append('assignto--'+CreatedBy)
- imagelist = []
- # print(found)
- remove_list = []
- import os
- import glob
- import pdfminer
-
- # import os
- # ts = 0
- # for file_name in glob.glob('./upload/*'):
- # fts = os.path.getmtime(file_name)
- # if fts > ts:
- # ts = fts
- # found = file_name
- # print(found)
-
-
-
- # print(extension)
-
def org_name():
    """Re-run OCR plus flair NER over the current card and collect the
    ORG/PER/LOC tagged tokens.

    NOTE(review): `organizations`, `PErsons` and `location` are assigned as
    locals here, so the results do not appear to escape this function —
    confirm whether callers rely on same-named variables from the enclosing
    scope instead.
    """
    print('org_name is working')
    import pytesseract
    source = found
    if extension != 'pdf':
        # Grayscale the image in place, re-save it at high DPI, then let
        # tesseract build a searchable PDF that pdfminer can read back.
        grey = cv2.cvtColor(cv2.imread(source), cv2.COLOR_BGR2GRAY)
        cv2.imwrite(str(found), grey)
        from PIL import Image
        Image.open(found).save("images1.png", dpi=(1200, 1200))
        source = "images1.png"
        import pytesseract as tess
        tess.pytesseract.tesseract_cmd = r"C:\Program Files (x86)\Tesseract-OCR\tesseract.exe"
        searchable = tess.image_to_pdf_or_hocr(source, extension="pdf")
        with open("demo.pdf", "w+b") as fh:
            fh.write(searchable)
        from pdfminer.high_level import extract_text
        text = extract_text('demo.pdf')
    else:
        # Input is already a PDF: read its text layer directly.
        from pdfminer.high_level import extract_text
        text = extract_text(source)

    # Tag the extracted text with the NER tagger loaded elsewhere in the file.
    tagged = Sentence(text)
    tagger.predict(tagged)
    pieces = str(tagged).split("→")

    import pandas as pd
    rows = []
    try:
        cleaned = pieces[1].replace("", "").replace("", "").replace("/", ":")
        rows.append(cleaned)
        # Round-trip through df.csv, matching the original flow.
        frame = pd.DataFrame(rows)[0]
        frame.to_csv("df.csv", index=False)
        token_lists = pd.read_csv("df.csv")["0"].str.split(",").to_list()
        entries = pd.DataFrame(token_lists[0])[0].to_list()
        # Bucket entries by their NER tag substring.
        organizations = [e for e in entries if "ORG" in e]
        PErsons = [e for e in entries if "PER" in e]
        location = [e for e in entries if "LOC" in e]
    except IndexError:
        # Tagger output had no "→" section: nothing to collect.
        pass
-
- # ************************************* ORGANIZATION ********************************************************************
-
def organisation():
    """Pick an OrganizationName from the NER ORG hits, cross-checked against
    the first detected website URL, and push it onto `final`.

    Fallback ladder: two ORG hits -> one ORG hit -> bare URL -> company()
    keyword scan.
    """
    print('organisation working ')

    def _strip(value, tokens):
        # Remove each token in order, mirroring the original replace() chains.
        for t in tokens:
            value = value.replace(t, '')
        return value

    def _url_key():
        # Normalised, upper-cased domain keyword from the first URL; raises
        # IndexError when no URL was found, which feeds the fallback ladder.
        key = str(urlfinal[0]).lower()
        key = _strip(key, ('.com', 'www.', '.in', '.co', 'https', 'http', ':', '/'))
        return key.upper()

    try:
        candidate = (_strip(organizations[0], (':ORG', '"', '[', '.com', '.in'))
                     + " /" + _strip(organizations[1], (':ORG', '"', '.com')))
        if len("OrganizationName--" + candidate) < 4:
            # NOTE(review): the prefix alone is 18 chars, so this guard can
            # never fire — confirm the intended threshold.
            pass
        else:
            match = _url_key()
            print(match)
            s1 = (_strip(organizations[0], (':ORG', '"', '[', '.com'))
                  + " /" + _strip(organizations[1], (':ORG', '"', '.com'))).upper()
            s2 = match.upper()
            from difflib import SequenceMatcher
            print(s1)
            print(s2)
            print(SequenceMatcher(None, s1, s2).ratio())
            if SequenceMatcher(None, s1, s2).ratio() >= 0.10:
                final.append(
                    "OrganizationName--"
                    + _strip(organizations[0], (':ORG', '"', '[', '.com', ']'))
                    + " /"
                    + _strip(organizations[1], (':ORG', '"', '.com', ']')))
            else:
                final.append("OrganizationName--" + s2)

    except IndexError:
        # Fewer than two ORG hits (or no URL): retry with a single ORG entry.
        try:
            head = _strip(organizations[0], (':ORG', '[', ']', '"', '.com', '.in'))
            if len("OrganizationName--" + head) < 4:
                pass
            else:
                match = _url_key()
                s1 = _strip(organizations[0], (':ORG', '"', '[', '.com')).upper()
                s2 = match.upper()
                from difflib import SequenceMatcher
                print(s1)
                print(s2)
                print(SequenceMatcher(None, s1, s2).ratio())
                if SequenceMatcher(None, s1, s2).ratio() >= 0.10:
                    final.append(
                        "OrganizationName--"
                        + _strip(organizations[0], (':ORG', '"', '[', ']', '.com')))
                else:
                    final.append("OrganizationName--" + s2)

        except IndexError:
            # No usable ORG hit: fall back to the bare URL, then to the
            # keyword scan over the OCR text.
            try:
                match = str(urlfinal[0]).lower()
                match = _strip(match, ('.com', 'www.', '.in', '.co')).upper()
                final.append("OrganizationName--" + match)
            except IndexError:
                company()
-
- #################################################company Name########################################
-
def company():
    """Fallback organisation-name finder.

    Scans test.txt for the first line containing a company-suffix keyword
    (LTD, PVT, INDUSTRIES, ...) and appends it, upper-cased, to `final` as
    an OrganizationName entry. Appends "OrganizationName--" (empty value)
    when no line matches.

    Fixes over the previous version: the file is opened read-only ('r', not
    'r+') and closed deterministically via a context manager, and the scan
    stops at the first hit — the old sticky-flag loop appended every
    subsequent line but only ever consumed the first one.
    """
    print('company list working')
    import re

    keyword_re = re.compile(
        r'''\bENTERPRISE\b|\bTRADE\b|\bEMPIRE\b|\bSTORES\b|\bMACHINERY\b|\bINDUSTRIES\b|\bTECHNOLOGY\b|\bCOMPANY\b|\bDESIGNERS\b|\bPOLYMER\b|\bBELT\b|\bAGRO\b|\bPLASTIC\b|\bGROUP\b|\bTOOLS\b|\bENGG.\b|\bSOLUTION\b|\bCONSTRUCTION\b|\bPACK\b|\bELECT\b|\bSTEEL\b|\bIRON\b|\bDIES\b|\bMOULD\b|\bCORPORATION\b|\bSEEDS\b|\bPOWER\b|\bCONSULTANT\b|\bMFG.\b|\bPRINT\b|\bFOOD\b|\bSOLAR\b|\bINDUSTRY\b|\bLIMITED\b|\bPRIVATE\b|\bPVT\b|\bLTD\b|\bOUTSOURCING\b|\bCNC\b|\bMACHINERIES\b|\bSOLUTIONS\b|\bENGINEERS\b|\bWORKS\b|\bPRODUCTS\b|\bENTERPRISES\b|\bCOMPANIES\b|\bPOLYMERS\b|\bTRADING\b''')

    hit = None
    with open('test.txt', 'r') as f:
        for line in f:
            line = line.upper()
            if keyword_re.search(line):
                hit = line
                break  # only the first matching line was ever consumed

    if hit is not None:
        final.append(("OrganizationName--" + hit).replace('\n', ''))
    else:
        final.append("OrganizationName--")
-
- # ************************************* CONTACT PERSON *******************************************************************
def contactpersonname():
    """Append a CONTACTPERSONNAME entry to `final` from the PER-tagged hits.

    Uses the first two person hits joined by '/', falls back to a single
    hit, then to an empty value when no person was tagged at all.
    """
    print('contactpersonname working')

    def _person(idx, *brackets):
        # Strip the NER tag and quoting noise; extra bracket chars vary
        # per call site, matching the original replace() chains.
        cleaned = PErsons[idx].replace(":PER", "").replace('"', '')
        for b in brackets:
            cleaned = cleaned.replace(b, "")
        return cleaned

    try:
        final.append("CONTACTPERSONNAME--" + _person(0, "[", "]") + '/' + _person(1))
    except IndexError:
        try:
            final.append("CONTACTPERSONNAME--" + _person(0, "[", "]"))
        except IndexError:
            final.append("CONTACTPERSONNAME--")
-
def image_to_text():
    """Run the `ocr` engine (configured elsewhere; result layout looks like
    PaddleOCR's — confirm) over the current card image and stash the
    recognised text on the function attribute `image_to_text.txt`,
    one hit per line, dropping hits shorter than 4 characters.
    """
    import cv2
    # Grayscale the image in place before recognition.
    grey = cv2.cvtColor(cv2.imread(found), cv2.COLOR_BGR2GRAY)
    cv2.imwrite(str(found), grey)

    hits = ocr.ocr(found, cls=True)[0]
    recognised = [hit[1][0] for hit in hits]

    # Fragments under 4 chars are treated as OCR noise.
    kept = [str(t) + "\n" for t in recognised if len(t) >= 4]
    image_to_text.txt = "".join(kept)
-
def pdf_to_text():
    """Extract the embedded text layer of `found` (a PDF path) and stash it
    on the function attribute `pdf_to_text.txt`."""
    from pdfminer.high_level import extract_text
    extracted = extract_text(found)
    pdf_to_text.txt = extracted
-
- extensionlist = ['JPEG', 'jpg', 'png', 'JPG', 'PNG', 'jpeg']
-
- if extension in extensionlist:
- print('image' + extension)
- image_to_text()
- x = image_to_text.txt
-
- else:
- print('pdf' + extension)
- pdf_to_text()
- x = pdf_to_text.txt
-
- verticaltext = x
- htext = x
- # print('------------------------------------------------')
- print(
- '############################################################# this is verticaltext #################################################################')
- print(verticaltext)
- htext = htext.replace('\n', ' ')
- print(
- '############################################################# this is htext #############################################################')
- print(htext)
- y = x.replace('\n', ',')
- y = y.replace(' ', ' ')
- # y = y.replace(".", " .")
- horizontaltext = y
- # print('------------------------------------------------')
- print(
- '############################################################# this is horizontaltext #############################################################')
- print(horizontaltext)
-
- textfile = open("test123456.txt", "w")
- a = textfile.write(verticaltext)
- textfile.close()
- textfile = open("vtext.txt", "w")
- a = textfile.write(horizontaltext)
- textfile.close()
- with open('test123456.txt', 'r') as f:
- with open('test.txt', 'w') as w:
- for line in f:
- if line.strip().replace('|', ''):
- w.write(line)
-
- ###########################ADDRESS##################################
- addrespinlst = []
-
def splitaddress():
    """Regex fallback for address extraction.

    Stitches together: the last word before the first comma in `htext`
    (e.g. a door number), the text leading up to a PIN-like number, and the
    PIN itself, then appends 'ADDRESS--...' to `final` and the raw address
    to `addrespinlst`. When either regex finds nothing the NameError path
    just logs and gives up.
    """
    import re
    flat = htext.replace('\n', ' ')

    # Last word of the segment before the first comma.
    address1 = flat.partition(",")[0].split()[-1]

    tail = htext.partition(",")[2]
    a = tail.replace('\n', ' ').replace('\x0c', '')
    addre = a.partition(",")[2]  # kept for parity with the original flow (unused)

    # Keep only the LAST candidate each pattern produces, exactly as the
    # original overwrite-in-a-loop did. findall yields per-alternative
    # group tuples, which are stringified then scrubbed.
    for grp in re.findall(
            r'(.*?)-\d{3} \d{3}|(.*?)\b-\d{6}\b|(.*?)\b\d{6}\b|(.*?)\b\d{3} \d{3}\b|\b(.*?)-\d{2}\b|(.*?)\b\d{3} \d{3}\b',
            a):
        # NOTE(review): the final ' ' target may originally have been a
        # double space lost to formatting — confirm against the source repo.
        address2 = str(grp).replace("'", "").replace("(", "").replace(")", "").replace(', ,', '').replace(' ', '')

    for address3 in re.findall(
            r'-\d{6}|\b\d{6}\b|\b\d{3} \d{3}\b|\b-\d{2}\b|-\d{3} \d{3}|\b\d{3} \d{3}\b', a):
        pass  # address3 ends up holding the last match

    try:
        Address = address1 + "," + address2 + "," + address3
        final.append('ADDRESS--' + Address)
        addrespinlst.append(Address)
    except NameError:
        # One of the regexes matched nothing; the NER-based address model
        # used to run here (now commented out upstream).
        print(
            '############################################################ Addressmodelworking #############################################################')
        pass
- ################################################## website#######################################################
-
- # import re
-
- # url = []
- # matches = re.findall(r'www.*', verticaltext)
- # for match in matches:
- # if (match.count('.')) == 1:
- # a_string1 = match.replace("www", "www.")
-
- # final.append("Urls--" + a_string1)
- # url.append(a_string1)
- # else:
-
- # final.append("Urls--" + match)
-
- # if len(url)==0:
-
- # from urlextract import URLExtract
-
- # extractor = URLExtract()
- # urls = extractor.find_urls(verticaltext)
- # try:
- # urllist = urls[0]
- # final.append("Urls--"+urllist)
- # url.append(urllist)
- # except IndexError:
- # final.append("Urls--")
-
- # for match in matches:
- # if (match.count('.')) == 1:
- # a_string1 = match.replace("www", "www.")
-
- # final.append("Urls--" + a_string1)
- # url.append(a_string1)
- # else:
-
- # final.append("Urls--" + match)
- # url.append(match)
- # remove_list.append(match)
- # else:
- # final.append("Urls--" )
-
- ################################################## website#######################################################
-
- import re
- # final=[]
- url = []
- urlfinal = []
- matches = re.findall(r'www.*', verticaltext)
- for match in matches:
-
- if (match.count('.')) == 1:
- a_string1 = match.replace("www", "www.")
-
- # final.append("Urls--" + a_string1)
- url.append(a_string1)
- else:
-
- url.append(match)
-
- if len(url) == 0:
-
- from urlextract import URLExtract
-
- extractor = URLExtract()
- urls = extractor.find_urls(verticaltext)
- try:
- urllist = urls[0]
- url.append(urllist)
- url.append(urllist)
- except IndexError:
- pass
-
- for match in matches:
- if (match.count('.')) == 1:
- a_string1 = match.replace("www", "www.")
-
- url.append(a_string1)
- # url.append(a_string1)
- else:
-
- url.append(match)
- url.append(match)
-
- else:
- pass
- try:
- test_string = url[0]
-
- test_list = ['com', 'www', 'in', 'co', "WWW", "COM", "CO", "IN"]
-
- res = [ele for ele in test_list if (ele in test_string)]
-
- if len(res) == 0:
- print('no match')
-
- final.append('urls--')
-
-
- else:
- print('matched')
- final.append('urls--' + url[0])
- urlfinal.append(url[0])
-
-
- except IndexError:
- final.append('urls--')
-
- print(
- '############################################################# url #############################################################')
- print(url)
- #######organisation and contact################
-
- # def company_url():
- # # print('--url--')
- # # print(url)
-
- # try:
- # match = str(url[0]).lower()
- # match =match.replace('.com','').replace('www.','').replace('.in','').replace('.co','').upper()
- # final.append("OrganizationName--" + match)
- # # remove_list.append(match)
- # except IndexError:
- # org_name()
- # organisation()
- # final.append("OrganizationName--")
-
- # make example sentence
-
- # print(horizontaltext)
- sentence = Sentence(verticaltext)
-
- # predict NER tags
- tagger.predict(sentence)
-
- # print sentence
- ko = (sentence)
-
- ko1 = str(ko).split("→")
- import pandas as pd
-
- dfg = []
- try:
- s = ko1[1].replace("", "").replace("", "").replace("/", ":")
- except IndexError:
- os.remove(found)
- return 'Invalid image'
- dfg.append(s)
- df = pd.DataFrame(dfg)
- df = df[0]
-
- df.to_csv("df.csv", index=False)
-
- df1 = pd.read_csv("df.csv")
- ve = df1["0"].str.split(",")
- fgf = ve.to_list()
- dfgh = pd.DataFrame(fgf[0])
- maindf = dfgh[0] # .str.split(":")
- # maindf.to_csv("main.csv")
-
- main1 = maindf.to_list()
- main1
- # cv=pd.DataFrame(ve)
- # cv
- per = ["PER"]
- org = ["ORG"]
- loc = ["LOC"]
- organizations = [i for i in main1 for j in org if j in i]
- PErsons = [i for i in main1 for j in per if j in i]
- location = [i for i in main1 for j in loc if j in i]
-
- # ************************************* ORGANIZATION ********************************************************************
- try:
- if len(("OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace('[',
- '').replace(
- ']', '').replace(
- '.com', '') + " /" + organizations[1].replace(":ORG", "").replace('"', '').replace('.com', ''))) < 4:
- pass
- # company_url()
- else:
-
- match = str(urlfinal[0]).lower()
- match = match.replace('.com', '').replace('www.', '').replace('.in', '').replace('.co', '').replace(
- 'https',
- '').replace(
- 'http', '').replace(":", "").replace("/", "").upper()
- print(match)
-
- s1g = organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace(']', '').replace(
- '.com', '') + " /" + \
- organizations[1].replace(":ORG", "").replace('"', '').replace('.com', '')
- s1 = s1g.upper()
- s2 = match.upper()
- from difflib import SequenceMatcher
- print(s1)
- print(s2)
- print(SequenceMatcher(None, s1, s2).ratio())
- if SequenceMatcher(None, s1, s2).ratio() >= 0.10:
- # and SequenceMatcher(None, s1, s2).ratio()<0.50:
- final.append(
- "OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace('[',
- '').replace(
- '.com', '').replace(']', '') + " /" + organizations[1].replace(":ORG", "").replace('"',
- '').replace(
- '.com', '').replace(']', ''))
- else:
- final.append("OrganizationName--" + s2)
-
-
-
- except IndexError:
- try:
- if len(("OrganizationName--" + organizations[0].replace(":ORG", "").replace('[', '').replace(']',
- '').replace(
- '"',
- '').replace(
- '.com', ''))) < 4:
- pass
- # company_url()
- else:
-
- match = str(urlfinal[0]).lower()
- match = match.replace('.com', '').replace('www.', '').replace('.in', '').replace('.co', '').replace(
- 'https', '').replace('http', '').replace(":", "").replace("/", "").upper()
-
- s1g = organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace(']',
- '').replace(
- '.com', '')
- s1 = s1g.upper()
- s2 = match.upper()
- from difflib import SequenceMatcher
- print(s1)
- print(s2)
- print(SequenceMatcher(None, s1, s2).ratio())
- if SequenceMatcher(None, s1, s2).ratio() >= 0.10:
- # and SequenceMatcher(None, s1, s2).ratio()<0.50:
- final.append(
- "OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace('[',
- '').replace(
- ']', '').replace(
- '.com', '').replace(']', ''))
- else:
- final.append("OrganizationName--" + s2)
-
- except IndexError:
- org_name()
- organisation()
-
- # final.append("OrganizationName--")
-
- # ************************************* CONTACT PERSON *******************************************************************
- try:
- final.append(
- "CONTACTPERSONNAME--" + PErsons[0].replace(":PER", "").replace("[", "").replace('"', '').replace("]",
- "") +
- PErsons[
- 1].replace(":PER", "").replace('"', ''))
- except IndexError:
- try:
- final.append(
- "CONTACTPERSONNAME--" + PErsons[0].replace(":PER", "").replace("[", "").replace("]", "").replace(
- '"',
- ''))
- except IndexError:
- org_name()
- contactpersonname()
- # final.append("CONTACTPERSONNAME--")
- ###############address flair#####################
-
- try:
- print(
- '############################################################# address new code #############################################################')
- loactionlst = ['address', 'factory', 'd.no', 'h.no', 'h. no', 'plot', 'flat', 'plat']
- loclst = [i for i in loactionlst if i in htext.lower()]
-
- textaddress = htext
- textaddress = textaddress.replace("|", ",")
- textaddress = textaddress.lower()
-
- nlp = pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy="simple")
- grop = nlp(textaddress)
-
- citycountry = []
- print('########################### city or country name ###########################')
- d = grop[-1]
-
- if d['entity_group'] == "COUNTRY":
- print(d["word"])
- citycountry.append(d["word"])
- elif d['entity_group'] == "CITY":
- print(d["word"])
- citycountry.append(d["word"])
-
- try:
- address1 = loclst[0]
- except IndexError:
- address1 = (textaddress.partition(",")[0])
- words = address1.split()
- address1 = words[-1]
-
- star_location = address1.lower()
- end_location = citycountry[0].replace("#", "")
- start = star_location
- end = end_location
- s = textaddress.lower()
- middle_address = (s.split(start))[-1].split(end)[0]
- Address = start + middle_address + end
- Address = Address.replace('--', '').title()
- print(Address)
- if Address.count(',') < 2:
- splitaddress()
- else:
- final.append('ADDRESS--' + Address)
-
- # star_location = location[0].replace(":LOC", "").replace('"', '').replace('[', '')
- # end_location = location[-1].replace(":LOC", "").replace('"', '').replace(']', '')
- # d1 = star_location.split()
- # d2 = end_location.split()
- # d3 = d1[0]
- # d4 = d2[0]
- # start = d3
- # end = d4
- # s = horizontaltext
- # middle_address = ((s.split(start))[1].split(end)[0])
- # Address = d3 + middle_address + d4
- # final.append('ADDRESS--' + Address)
- # addrespinlst.append(Address)
-
-
- except IndexError:
- splitaddress()
-
- ########################################## Designation ###########################################
- import re
- new = []
- with open('test.txt', 'r') as f:
- flag = False
- for line in f:
- line1 = line
- line = line.upper()
- matches = re.findall(
- r'''\bAPPRENTICE\b|\bEXECUTIVE\b|\bPROPRIETOR\b|\bPARTNER\b|\bMD\b|\bANALYST\b|\bPRACTITIONER\b|\bCUSTOMER\b|\bCOO\b|\bCOACH\b|\bADMINISTRATIVE\b|\bADMINISTRATOR\b|\bAGENT\b|\bHEAD\b|\bCHIEF\b|\bDIRECTOR\b|\bVICE\b|\bPRESIDENT\b|\bMANAGER\b|\bCOORDINATOR\b|\bCOUNSELOR\b|\bSUPERVISOR\b|\bASSISTANT\b|\bSPECIALIST\b|\bARTIST\b|\bWORKER\b|\bCONSULTANT\b|\bREPRESENTATIVE\b|\bARCHITECT\b|\bSTAFF\b|\bMEMBER\b|\bDEVELOPER\b|\bENGINEER\b|\bEXAMINOR\b|\bDOCTOR\b|\bPROFESSOR\b|\bTEACHER\b|\bLEAD\b|\bOFFICER\b|\bCEO\b|\bC.E.O\b|\bJUNIOR\b|\bSENIOR\b|\bPROFESSOR\b|\bSALES\b''',
- line)
- for match in matches:
- line = line.replace('-', '')
- # print(line)
- o = "Designation--" + line
- new.append(o)
- remove_list.append(str(line1).replace('\n', ''))
-
- try:
- a = new[0].replace('\n', '')
- final.append(a)
-
- except IndexError:
- final.append("Designation--")
-
- ###################################################Phone number#################################################
- num = []
- import phonenumbers
-
- # print(verticaltext)
- numbers = phonenumbers.PhoneNumberMatcher(
- verticaltext.replace('+91', '').replace('(0)', '').replace('(', '').replace(')', ''), "IN")
-
- for number in numbers:
- number = str(number).split(")")
- num.append(number[1])
- # num.append(number[-1])
- if len(num) == 0:
- final.append("ContactNumber--")
- final.append("OrganizationNumber--")
- elif len(num) > 1:
- final.append("ContactNumber--" + num[0].replace(' ', ''))
- final.append("OrganizationNumber--" + num[-1].replace(' ', ''))
- elif len(num) == 1:
- try:
- final.append("ContactNumber--" + num[0].replace(' ', ''))
- final.append("OrganizationNumber--")
- except IndexError:
- final.append("ContactNumber--")
- final.append("OrganizationNumber--")
- print(
- '############################################################# num #############################################################')
- print(num)
- # try:
- # final.append("PhoneNumber--" + num[0].replace(' ', ''))
- # remove_list.append(num[0])
- # except IndexError:
- # pass
- # try:
- # final.append("PhoneNumber1--" + num[1].replace(' ', ''))
- # remove_list.append(num[1])
- # except IndexError:
- # pass
- # try:
- # final.append("PhoneNumber2--" + num[2].replace(' ', ''))
- # remove_list.append(num[2])
- # except IndexError:
- # pass
-
- ################################################### Email######################################################
- import re
- from email_scraper import scrape_emails
- s = list(scrape_emails(horizontaltext))
- email_id = s
-
- # email_id = []
- # matches = re.findall(r'[\w\.-]+@[\w\.-]+', verticaltext)
- # for match in matches:
- # email_id.append(match)
-
- # # final.append('Email--' + match)
- # email_ = str(email_id).replace("[", "").replace("]", "").replace("'", "")
- # # final.append(email_)
-
- # # final.append('Email--' + email_)
- # # remove_list.append(email_)
- if len(email_id) > 1:
- final.append(
- 'ContactEmail--' + str(email_id[0]).replace("[", "").replace("]", "").replace("\\n", "").replace("'",
- ""))
- final.append(
- 'OrganizationEmail--' + str(email_id[-1]).replace("[", "").replace("]", "").replace("\\n", "").replace(
- "'",
- ""))
- else:
- try:
- final.append(
- 'ContactEmail--' + str(email_id[0]).replace("[", "").replace("]", "").replace("\\n", "").replace(
- "'",
- ""))
- final.append('OrganizationEmail--')
- except IndexError:
- final.append('ContactEmail--')
- final.append('OrganizationEmail--')
-
- ###############PINCODE############
-
- pinlst = []
- print(addrespinlst)
- import pgeocode
-
- # try:
- # matche1 = re.findall(r'-\d{6}|\b\d{6}\b|\b\d{3} \d{3}\b|-\d{3} \d{3}|\b\d{3} \d{3}\b', addrespinlst[0])
- # for i in matche1:
- # address3 = i.replace(' ', '').replace('-', '')
- # pinlst.append(address3)
- # except IndexError:
-
- lst = []
- for i in num:
- i = i[1:]
- lst.append(i)
-
- infile = r"vtext.txt"
- outfile = r"cleaned_file.txt"
- import glob
- delete_list = lst
- # delete_list = ["firstname1 lastname1","firstname2 lastname2","firstnamen lastnamen",'Director - Sales & Business Development']
- fin = open(infile, "r+")
- fout = open(outfile, "w+")
- for line12 in fin:
- for word in delete_list:
- line12 = line12.replace(word, "")
-
- fout.write(line12)
- fin.close()
- # print(line)
-
- # print(addrespinlst)
- import pgeocode
- print(line12)
- import re
- matche1 = re.findall(r'-\d{6}|\b\d{6}\b|\b\d{3} \d{3}\b|-\d{3} \d{3}|\b\d{3} \d{3}\b', line12)
- for i in matche1:
- address3 = i.replace(' ', '').replace('-', '')
- pinlst.append(address3)
-
- nomi = pgeocode.Nominatim('IN')
- try:
- a = nomi.query_postal_code(str(pinlst[-1]))
- # print(a)
- b = a.keys()
- c = b.values.tolist()
- d = a.tolist()
- postal_code = "PinCode1" + "--" + d[0]
- final.append(postal_code)
- country_code = c[1] + "--" + str(d[1])
- final.append(country_code)
- place_name = 'LandMark1' + "--" + str(d[2])
- final.append(place_name)
- state_name = c[3] + "--" + str(d[3])
- final.append(state_name)
- state_code = c[4] + "--" + str(d[4])
- final.append(state_code)
- county_name = 'CityName1' + "--" + str(d[5])
- final.append(county_name)
-
- except (IndexError, NameError):
- final.append("PinCode1--")
- final.append("country_code--")
- final.append("LandMark1--")
- final.append("state_name--")
- final.append("state_code--")
- final.append("CityName1--")
-
-
- ######################################################## json #####################################################################
-
- import pandas as pd
- df = pd.DataFrame(final)
- df1 = df[0].str.split('--', expand=True)
- # print(df1)
- df1.rename({df1.columns[-2]: 'Keys'}, axis=1, inplace=True)
- df1.rename({df1.columns[-1]: 'Values'}, axis=1, inplace=True)
- df1['Keys']=df1['Keys'].str.strip()
- df1.to_csv('path123.csv', index=False)
- df2 = pd.read_csv('path123.csv')
- print(df2)
- df2 = df2.T
- df2.to_csv('path1.csv', index=False, header=False)
- df1 = pd.read_csv('path1.csv')
- df1.to_json('firstjson1.json', orient="index")
- import json
- with open('firstjson1.json', 'r') as json_file:
- json_load = json.load(json_file)
- # # url = "https://test.bizgaze.app:8443/apis/v4/bizgaze/integrations/businesscards/create"
- nothing = json.dumps(json_load).replace("]", "").replace("[", "").replace('{"0":', '').replace('}}', '}')
- # # print('--------------------------------------------------------------------------')
- # # print(nothing)
- empty = []
- import base64
- name = found
- image = open(name, 'rb')
- image_read = image.read()
- image_64_encode = base64.b64encode(image_read)
- NULL = 'null'
- empty.append("ByteData--" + (NULL).strip('""'))
- image_64_encode = image_64_encode.decode('utf-8')
- empty.append("FileData--" + str(image_64_encode))
- imagedata = name.split("/")
- imagename = str(imagedata[-1]).replace('"', '').replace("[", "").replace("]", "")
- imagename1 = str(imagename).split('.')
- imagename = str(imagename1[-2]).replace("[", "]")
- empty.append("FileName--" + imagename)
- empty.append("FilePath--" + found)
- imageExtension = str(imagename1[-1]).replace("[", "]")
- empty.append("FileType--" + imageExtension)
- image.close()
- import pandas as pd
- df = pd.DataFrame(empty)
- df = df[0].str.split("--", expand=True)
- data1 = pd.DataFrame(df[0])
- data2 = pd.DataFrame(df[1])
- dt = data2.set_index(data1[0])
- dt4 = dt.T
- dictionary = dt4.to_dict(orient="index")
- list1 = []
- # list.append(a)
- list1.append(dictionary[1])
- # # final.append("image--"+str(dictionary[1]).replace("\'",'"'))
- print('--------------------')
- # print(namelist)
- import json
- # JSON data:
- x = nothing
- # python object to be appended
- y = {"image": dictionary[1]}
- # parsing JSON string:
- z = json.loads(x)
- # appending the data
- z.update(y)
- # the result is a JSON string:
- # print(json.dumps(z))
-
- #############################################creating csv#####################################
- # print(final)
- # print(imagelist)
- # final.append('image--'+str(imagelist))
- # import requests
- # import json
- # # with open('visitingcard1.json', 'r') as json_file:
- # # json_load = json.load(json_file)
- # url = "https://demo.bizgaze.app/apis/v4/bizgaze/integrations/businesscards/create"
- url = "https://test.bizgaze.app/apis/v4/bizgaze/integrations/businesscards/create"
- # url = 'https://c01.bizgaze.app/apis/v4/bizgaze/integrations/businesscards/create' # C01
- payload1 = json.dumps(z)
- # print('--------------------------------------------------------------------------')
- # print(payload1)
- headers = {
- 'Authorization': 'stat db226c95fae04943aa3e3c03a4381b2a',
- # 'Authorization': 'stat f7cdb402e01e44e5842878653946168f', # c01
- # 'Authorization': 'Stat c3e11b2fcbfe455b86a1fe6efde02a69',#demo
- 'Content-Type': 'application/json'
- }
- response = requests.request("POST", url, headers=headers, data=payload1)
- # print("##############################################################")
-
- # #print(payload1)
- print(response.text)
- import os
- if 'BusinessCards Created Successfully' in response.text:
- print('present')
- os.remove(found)
- else:
- print('not present')
-
- # df1.to_json('visitingcard.json')
- # data = df1.to_json('visiting.json', orient='records')
- # print(data)
-
- # return render_template('index.html')
- # files = glob.glob('./upload/*')
- # for f in files:
- # os.remove(f)
-
- # print('Time Taken:',total)
-
-
-
- return response.text
-
- # return 'done'
-
-
- # # return send_file(p,as_attachment=True)
- # @app.route('/upload_BusinessCards', methods=["POST"])
- # def upload_BusinessCards():
- # if __name__ == "__main__":
- # url_list = []
- # Dataset = request.get_json()
- # print("8888888888888888888888888888888888888888888888888888888888888888888888888888888888")
- # #print(Dataset)
- # # id = "100013660000125"
- # url_list.append(Dataset)
- # # multiprocessing
-
- # with multiprocessing.Pool(processes=1) as pool:
- # # try:
- # results = pool.map(predict, url_list)
- # # except IndexError:
- # # return 'Invalid image'
- # # results.clear()
- # # a=results[0]
-
- # pool.close()
-
- # return results[0]
-
@app.route('/upload_BusinessCards', methods=["POST"])
def mainfunction():
    """Dispatch an uploaded business-card request to the right handler.

    Expects a JSON body containing the card payload(s). A single-item
    payload is handled by ``predict``; anything larger is handled by
    ``multiplecards``. Returns whatever the chosen handler returns
    (the downstream API's response text).
    """
    Dataset = request.get_json()
    # request.get_json() yields None when the body is missing or not valid
    # JSON; guard before len() so the client gets a clear 400 instead of an
    # opaque 500 caused by "TypeError: object of type 'NoneType' has no len()".
    if not Dataset:
        return 'Invalid or empty JSON payload', 400
    if len(Dataset) == 1:
        return predict(Dataset)
    # More than one card in the payload: use the batch handler.
    return multiplecards(Dataset)
-
-
if __name__ == "__main__":
    # Entry point: serve the Flask app on all interfaces, port 1112.
    app.run(host='0.0.0.0', port=1112)
-
-
-
|