"""Flask service that extracts structured contact data from business-card
images/PDFs and returns the parsed fields as a JSON list.

Pipeline per uploaded card: decode base64 payload -> OCR (PaddleOCR for
images, pdfminer for PDFs) -> NER (flair PER/ORG/LOC + a HuggingFace
city/country model) -> regex heuristics for company/designation/phone/
email/pincode -> assemble a dict per card and return the list.

NOTE(review): source was recovered from a collapsed dump; indentation was
reconstructed from syntax. Only comments/docstrings were added — every
executable token is unchanged.
"""
from flask import Flask, render_template, request, redirect, Response, send_file
import os
import requests
import pandas as pd
import pgeocode
from email_scraper import scrape_emails
import phonenumbers
from pdfminer.high_level import extract_text
import pytesseract
import time
import multiprocessing
from PIL import Image
from functools import partial
from urlextract import URLExtract
import pytesseract as tess
from PIL import Image
import os
import glob
from pytesseract import *
import shutil
import cv2
import matplotlib
from werkzeug.utils import secure_filename
import requests
import time
import multiprocessing
from PIL import Image
from functools import partial
import pandas as pd
# NOTE(review): many imports above are duplicated and/or unused; left as-is
# because this is a documentation-only pass.

################################################################
# Normalise the CWD to forward slashes (Windows-style paths elsewhere).
Current_Working_Directory = os.getcwd()
Current_Working_Directory = Current_Working_Directory.replace("\\", "/")
################################################################

from flair.data import Sentence
from flair.models import SequenceTagger
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# City/country token-classification model used for address extraction.
tokenizer = AutoTokenizer.from_pretrained("ml6team/bert-base-uncased-city-country-ner")
model = AutoModelForTokenClassification.from_pretrained("ml6team/bert-base-uncased-city-country-ner")
from paddleocr import PaddleOCR, draw_ocr

# PaddleOCR engine for image text extraction (angle classification on).
ocr = PaddleOCR(use_angle_cls=True, lang='en', use_space_char=True, show_log=True)
# Flair large English NER model (tags PER / ORG / LOC spans).
tagger = SequenceTagger.load("flair/ner-english-large")
import datetime

app = Flask(__name__)


@app.route('/', methods=['GET'])
def card():
    """Serve the card-upload page."""
    return render_template('card.html')


@app.route('/upload_BusinessCards', methods=["POST"])
def multiplecards():
    """Process a batch of uploaded business cards.

    Expects a JSON array of objects with keys ``FileData`` (base64 image/pdf),
    ``FileName`` and ``FileType``.  Returns a list of per-card dicts of the
    extracted fields (organization, contact person, address, phones, emails,
    pincode data, plus the original file payload under ``image``).

    NOTE(review): heavy use of nested closures that read loop-level variables
    (``final``, ``urlfinal``, ``organizations``...) defined *after* the
    closures — correctness depends on call order; do not reorder statements.
    Writes several scratch files (test.txt, vtext.txt, df.csv, ...) into the
    CWD, so concurrent requests would race — TODO confirm single-worker
    deployment.
    """
    from pathlib import Path
    Path("multicards").mkdir(exist_ok=True)
    datalist = []
    zlist = []  # accumulates the per-card result dicts returned at the end
    Dataset = request.get_json()
    data = {'visiting': Dataset}
    for i in data['visiting']:
        import time
        a = i
        x = a['FileData']  # base64-encoded file contents
        y = a['FileName']
        z = a['FileType']
        name = y + '.' + z
        # Persist the uploaded file so OCR tools can read it from disk.
        img_data = x.encode()
        import base64
        with open('./multicards/' + name, "wb") as fh:
            fh.write(base64.decodebytes(img_data))
        found = './multicards/' + name  # path used by all helpers below
        print(found)
        extension = found.split('.')[-1]
        import re
        import csv
        import glob
        import os
        import numpy as np
        import glob
        import os
        import cv2
        import requests
        final = []        # "Key--Value" strings collected for this card
        imagelist = []
        remove_list = []  # raw lines already consumed (designations etc.)
        import os
        import glob
        import pdfminer

        def org_name():
            """Re-OCR the card via Tesseract->PDF->pdfminer and re-run flair NER.

            NOTE(review): results (organizations/PErsons/location) are assigned
            to *local* names and discarded; only side effects are the scratch
            files.  Called as a fallback before organisation()/contactpersonname().
            """
            print('org_name is working')
            import pytesseract
            fname = found
            if extension != 'pdf':
                # Grayscale the image in place, upscale DPI, then OCR to PDF.
                img = cv2.imread(fname)
                img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
                cv2.imwrite(str(found), img)
                from PIL import Image
                im = Image.open(found)
                im.save("images1.png", dpi=(1200, 1200))
                fname = "images1.png"
                import pytesseract as tess
                from PIL import Image
                # Hard-coded Windows Tesseract path — TODO confirm deploy host.
                tess.pytesseract.tesseract_cmd = r"C:\\Program Files (x86)\\Tesseract-OCR\\tesseract.exe"
                pdf = tess.image_to_pdf_or_hocr(fname, extension="pdf")
                with open("demo.pdf", "w+b", ) as f:
                    f.write(pdf)
                from pdfminer.high_level import extract_text
                text = extract_text('demo.pdf')
            else:
                from pdfminer.high_level import extract_text
                text = extract_text(fname)
            # Tag the OCR text and split the flair sentence repr on the arrow.
            sentence = Sentence(text)
            tagger.predict(sentence)
            ko = (sentence)
            ko1 = str(ko).split("→")
            import pandas as pd
            dfg = []
            try:
                # NOTE(review): the two .replace("", "") calls are no-ops —
                # presumably special characters were lost in transit; verify.
                s = ko1[1].replace("", "").replace("", "").replace("/", ":")
                dfg.append(s)
                df = pd.DataFrame(dfg)
                df = df[0]
                df.to_csv("df.csv", index=False)
                df1 = pd.read_csv("df.csv")
                ve = df1["0"].str.split(",")
                fgf = ve.to_list()
                dfgh = pd.DataFrame(fgf[0])
                maindf = dfgh[0]
                main1 = maindf.to_list()
                main1
                per = ["PER"]
                org = ["ORG"]
                loc = ["LOC"]
                organizations = [i for i in main1 for j in org if j in i]
                PErsons = [i for i in main1 for j in per if j in i]
                location = [i for i in main1 for j in loc if j in i]
            except IndexError:
                pass

        # ***************************** ORGANIZATION *****************************
        def organisation():
            """Append an OrganizationName-- entry to ``final``.

            Compares NER ORG candidates against the website domain via
            SequenceMatcher; falls back through fewer candidates and finally
            to company().  Reads loop-scope ``organizations``/``urlfinal``.
            """
            print('organisation working ')
            try:
                if len(("OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace(
                        '.com', '').replace('.in', '') + " /" + organizations[1].replace(":ORG", "").replace('"', '').replace(
                        '.com', ''))) < 4:
                    pass
                else:
                    # Strip scheme/TLD noise from the first URL for comparison.
                    match = str(urlfinal[0]).lower()
                    match = match.replace('.com', '').replace('www.', '').replace('.in', '').replace('.co', '').replace(
                        'https', '').replace(
                        'http', '').replace(":", "").replace("/", "").upper()
                    print(match)
                    s1g = organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace('.com', '') + " /" + \
                          organizations[1].replace(":ORG", "").replace('"', '').replace('.com', '')
                    s1 = s1g.upper()
                    s2 = match.upper()
                    from difflib import SequenceMatcher
                    print(s1)
                    print(s2)
                    print(SequenceMatcher(None, s1, s2).ratio())
                    # >= 0.10 similarity: trust the NER names; else use the domain.
                    if SequenceMatcher(None, s1, s2).ratio() >= 0.10:
                        final.append(
                            "OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace(
                                '.com', '').replace(']', '') + " /" + organizations[1].replace(":ORG", "").replace('"', '').replace(
                                '.com', '').replace(']', ''))
                    else:
                        final.append("OrganizationName--" + s2)
            except IndexError:
                # Fewer than two ORG candidates: retry with just the first one.
                try:
                    if len(("OrganizationName--" + organizations[0].replace(":ORG", "").replace('[', '').replace(']', '').replace(
                            '"', '').replace(
                            '.com', '').replace('.in', ''))) < 4:
                        pass
                    else:
                        match = str(urlfinal[0]).lower()
                        match = match.replace('.com', '').replace('www.', '').replace('.in', '').replace('.co', '').replace(
                            'https', '').replace('http', '').replace(":", "").replace("/", "").upper()
                        s1g = organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace('.com', '')
                        s1 = s1g.upper()
                        s2 = match.upper()
                        from difflib import SequenceMatcher
                        print(s1)
                        print(s2)
                        print(SequenceMatcher(None, s1, s2).ratio())
                        if SequenceMatcher(None, s1, s2).ratio() >= 0.10:
                            final.append(
                                "OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace(
                                    '[', '').replace(
                                    ']', '').replace(
                                    '.com', ''))
                        else:
                            final.append("OrganizationName--" + s2)
                except IndexError:
                    # No ORG candidates at all: derive the name from the URL,
                    # else fall back to the keyword scan in company().
                    try:
                        match = str(urlfinal[0]).lower()
                        match = match.replace('.com', '').replace('www.', '').replace('.in', '').replace('.co', '').upper()
                        final.append("OrganizationName--" + match)
                    except IndexError:
                        company()

        # ############################ company name ############################
        def company():
            """Last-resort organization detection: scan the cleaned OCR text
            (test.txt) for company-type keywords and take the first hit."""
            print('company list working')
            import re
            new = []
            with open('test.txt', 'r+') as f:
                flag = False
                for line in f:
                    line = line.upper()
                    matches = re.findall(
                        r'''\bENTERPRISE\b|\bTRADE\b|\bEMPIRE\b|\bSTORES\b|\bMACHINERY\b|\bINDUSTRIES\b|\bTECHNOLOGY\b|\bCOMPANY\b|\bDESIGNERS\b|\bPOLYMER\b|\bBELT\b|\bAGRO\b|\bPLASTIC\b|\bGROUP\b|\bTOOLS\b|\bENGG.\b|\bSOLUTION\b|\bCONSTRUCTION\b|\bPACK\b|\bELECT\b|\bSTEEL\b|\bIRON\b|\bDIES\b|\bMOULD\b|\bCORPORATION\b|\bSEEDS\b|\bPOWER\b|\bCONSULTANT\b|\bMFG.\b|\bPRINT\b|\bFOOD\b|\bSOLAR\b|\bINDUSTRY\b|\bLIMITED\b|\bPRIVATE\b|\bPVT\b|\bLTD\b|\bOUTSOURCING\b|\bCNC\b|\bMACHINERIES\b|\bSOLUTIONS\b|\bENGINEERS\b|\bWORKS\b|\bPRODUCTS\b|\bENTERPRISES\b|\bCOMPANIES\b|\bPOLYMERS\b|\bTRADING\b''',
                        line)
                    for i in matches:
                        if i in line:
                            flag = True
                    # NOTE(review): flag is never reset, so every line after the
                    # first keyword hit is also collected — TODO confirm intent.
                    if flag:
                        o = "OrganizationName--" + line
                        new.append(o)
            try:
                a = new[0].replace('\n', '')
                final.append(a)
            except IndexError:
                final.append("OrganizationName--")

        # *************************** CONTACT PERSON ***************************
        def contactpersonname():
            """Append CONTACTPERSONNAME-- from the flair PER candidates
            (two names joined by '/', one name, or empty)."""
            print('contactpersonname working')
            try:
                final.append(
                    "CONTACTPERSONNAME--" + PErsons[0].replace(":PER", "").replace("[", "").replace('"', '').replace(
                        "]", "") + '/' + PErsons[
                        1].replace(":PER", "").replace('"', ''))
            except IndexError:
                try:
                    final.append(
                        "CONTACTPERSONNAME--" + PErsons[0].replace(":PER", "").replace("[", "").replace("]", "").replace(
                            '"', ''))
                except IndexError:
                    final.append("CONTACTPERSONNAME--")

        def image_to_text():
            """OCR the image via PaddleOCR; result is stored on the function
            object as ``image_to_text.txt`` (newline-joined, tokens >= 4 chars)."""
            import cv2
            img_path = found
            # Grayscale in place before OCR.
            img = cv2.imread(img_path)
            img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            cv2.imwrite(str(found), img)
            result = ocr.ocr(img_path, cls=True)
            result = result[0]
            txts = [line[1][0] for line in result]
            image_to_text.txt = ""
            for i in txts:
                if len(i) < 4:
                    continue  # drop very short OCR fragments (noise)
                image_to_text.txt = image_to_text.txt + str(i) + "\n"

        def pdf_to_text():
            """Extract text from a PDF upload; stored as ``pdf_to_text.txt``."""
            from pdfminer.high_level import extract_text
            pdf_to_text.txt = extract_text(found)

        # Choose the OCR path by file extension.
        extensionlist = ['JPEG', 'jpg', 'png', 'JPG', 'PNG', 'jpeg']
        if extension in extensionlist:
            print('image' + extension)
            image_to_text()
            x = image_to_text.txt
        else:
            print('pdf' + extension)
            pdf_to_text()
            x = pdf_to_text.txt
        # verticaltext keeps newlines; htext is space-joined; horizontaltext is
        # comma-joined — the regex heuristics below rely on these variants.
        verticaltext = x
        htext = x
        htext = htext.replace('\n', ' ')
        y = x.replace('\n', ',')
        # NOTE(review): replaces a space with a space (no-op) — presumably a
        # double-space collapse was lost in transit; verify.
        y = y.replace(' ', ' ')
        horizontaltext = y
        # Scratch files consumed by company()/Designation/pincode sections.
        textfile = open("test123456.txt", "w")
        a = textfile.write(verticaltext)
        textfile.close()
        textfile = open("vtext.txt", "w")
        a = textfile.write(horizontaltext)
        textfile.close()
        # test.txt = test123456.txt minus blank / pipe-only lines.
        with open('test123456.txt', 'r') as f:
            with open('test.txt', 'w') as w:
                for line in f:
                    if line.strip().replace('|', ''):
                        w.write(line)

        # ############################## ADDRESS ##############################
        addrespinlst = []

        def splitaddress():
            """Fallback address builder: first token before the first comma +
            regex-matched middle chunk + postal-code-like tail."""
            import re
            textaddress = htext.replace('\n', ' ')
            address1 = (textaddress.partition(",")[0])
            words = address1.split()
            address1 = words[-1]
            addre = (htext.partition(",")[2])
            a = addre.replace('\n', ' ').replace('\x0c', '')
            addre = (a.partition(",")[2])
            matches = re.findall(
                r'(.*?)-\d{3} \d{3}|(.*?)\b-\d{6}\b|(.*?)\b\d{6}\b|(.*?)\b\d{3} \d{3}\b|\b(.*?)-\d{2}\b|(.*?)\b\d{3} \d{3}\b',
                a)
            for match in matches:
                # Only the LAST match survives the loop (address2 rebinds).
                address2 = match
                address2 = str(address2)
                address2 = address2.replace("'", "").replace("(", "").replace(")", "").replace(', ,', '').replace(' ', '')
            matches = re.findall(r'-\d{6}|\b\d{6}\b|\b\d{3} \d{3}\b|\b-\d{2}\b|-\d{3} \d{3}|\b\d{3} \d{3}\b', a)
            for address3 in matches:
                pass  # keep only the last pincode-like match
            try:
                # NameError here means no regex matched (address2/address3 unbound).
                Address = address1 + "," + address2 + "," + address3
                final.append('ADDRESS--' + Address)
                addrespinlst.append(Address)
            except NameError:
                print(
                    '############################################################ Addressmodelworking #############################################################')
                pass

        # ############################## website ##############################
        import re
        url = []
        urlfinal = []
        matches = re.findall(r'www.*', verticaltext)
        for match in matches:
            if (match.count('.')) == 1:
                # "wwwexample.com" style: re-insert the missing dot.
                a_string1 = match.replace("www", "www.")
                url.append(a_string1)
            else:
                url.append(match)
        if len(url) == 0:
            # No "www..." literal found: fall back to URLExtract.
            from urlextract import URLExtract
            extractor = URLExtract()
            urls = extractor.find_urls(verticaltext)
            try:
                urllist = urls[0]
                url.append(urllist)
                url.append(urllist)
            except IndexError:
                pass
            for match in matches:
                if (match.count('.')) == 1:
                    a_string1 = match.replace("www", "www.")
                    url.append(a_string1)
                else:
                    url.append(match)
                    url.append(match)
        else:
            pass
        try:
            # Accept the candidate only if it contains a known TLD/prefix token.
            test_string = url[0]
            test_list = ['com', 'www', 'in', 'co', "WWW", "COM", "CO", "IN"]
            res = [ele for ele in test_list if (ele in test_string)]
            if len(res) == 0:
                print('no match')
                final.append('urls--')
            else:
                print('matched')
                final.append('urls--' + url[0])
                urlfinal.append(url[0])
        except IndexError:
            final.append('urls--')
        print(
            '############################################################# url #############################################################')
        print(url)

        # Run flair NER on the full OCR text and bucket spans by tag.
        sentence = Sentence(verticaltext)
        tagger.predict(sentence)
        ko = (sentence)
        ko1 = str(ko).split("→")
        import pandas as pd
        dfg = []
        try:
            # NOTE(review): the two .replace("", "") calls are no-ops — likely
            # mangled special characters; verify against the original source.
            s = ko1[1].replace("", "").replace("", "").replace("/", ":")
        except IndexError:
            # No NER spans at all: treat the upload as unusable.
            os.remove(found)
            return 'Invalid image'
        dfg.append(s)
        df = pd.DataFrame(dfg)
        df = df[0]
        df.to_csv("df.csv", index=False)
        df1 = pd.read_csv("df.csv")
        ve = df1["0"].str.split(",")
        fgf = ve.to_list()
        dfgh = pd.DataFrame(fgf[0])
        maindf = dfgh[0]
        main1 = maindf.to_list()
        main1
        per = ["PER"]
        org = ["ORG"]
        loc = ["LOC"]
        organizations = [i for i in main1 for j in org if j in i]
        PErsons = [i for i in main1 for j in per if j in i]
        location = [i for i in main1 for j in loc if j in i]

        # ***************************** ORGANIZATION *****************************
        # Same cascade as organisation(), inlined for the primary pass.
        try:
            if len(("OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace(
                    ']', '').replace(
                    '.com', '') + " /" + organizations[1].replace(":ORG", "").replace('"', '').replace('.com', ''))) < 4:
                pass
            else:
                match = str(urlfinal[0]).lower()
                match = match.replace('.com', '').replace('www.', '').replace('.in', '').replace('.co', '').replace(
                    'https', '').replace(
                    'http', '').replace(":", "").replace("/", "").upper()
                print(match)
                s1g = organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace(']', '').replace(
                    '.com', '') + " /" + \
                      organizations[1].replace(":ORG", "").replace('"', '').replace('.com', '')
                s1 = s1g.upper()
                s2 = match.upper()
                from difflib import SequenceMatcher
                print(s1)
                print(s2)
                print(SequenceMatcher(None, s1, s2).ratio())
                if SequenceMatcher(None, s1, s2).ratio() >= 0.10:
                    final.append(
                        "OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace(
                            '.com', '').replace(']', '') + " /" + organizations[1].replace(":ORG", "").replace('"', '').replace(
                            '.com', '').replace(']', ''))
                else:
                    final.append("OrganizationName--" + s2)
        except IndexError:
            try:
                if len(("OrganizationName--" + organizations[0].replace(":ORG", "").replace('[', '').replace(']', '').replace(
                        '"', '').replace(
                        '.com', ''))) < 4:
                    pass
                else:
                    match = str(urlfinal[0]).lower()
                    match = match.replace('.com', '').replace('www.', '').replace('.in', '').replace('.co', '').replace(
                        'https', '').replace('http', '').replace(":", "").replace("/", "").upper()
                    s1g = organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace(']', '').replace(
                        '.com', '')
                    s1 = s1g.upper()
                    s2 = match.upper()
                    from difflib import SequenceMatcher
                    print(s1)
                    print(s2)
                    print(SequenceMatcher(None, s1, s2).ratio())
                    if SequenceMatcher(None, s1, s2).ratio() >= 0.10:
                        final.append(
                            "OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace(
                                ']', '').replace(
                                '.com', '').replace(']', ''))
                    else:
                        final.append("OrganizationName--" + s2)
            except IndexError:
                # Re-OCR with Tesseract, then retry via the helper cascade.
                org_name()
                organisation()

        # *************************** CONTACT PERSON ***************************
        try:
            final.append(
                "CONTACTPERSONNAME--" + PErsons[0].replace(":PER", "").replace("[", "").replace('"', '').replace("]",
                                                                                                                 "") +
                PErsons[
                    1].replace(":PER", "").replace('"', ''))
        except IndexError:
            try:
                final.append(
                    "CONTACTPERSONNAME--" + PErsons[0].replace(":PER", "").replace("[", "").replace("]", "").replace(
                        '"', ''))
            except IndexError:
                org_name()
                contactpersonname()

        # ########################### address (flair) ###########################
        try:
            print(
                '############################################################# address new code #############################################################')
            # Keywords that typically start an address line on Indian cards.
            loactionlst = ['address', 'factory', 'd.no', 'h.no', 'h. no', 'plot', 'flat', 'plat']
            loclst = [i for i in loactionlst if i in htext.lower()]
            textaddress = htext
            textaddress = textaddress.replace("|", ",")
            textaddress = textaddress.lower()
            # HuggingFace city/country NER; last entity is taken as the address end.
            nlp = pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy="simple")
            grop = nlp(textaddress)
            citycountry = []
            print('########################### city or country name ###########################')
            d = grop[-1]
            if d['entity_group'] == "COUNTRY":
                print(d["word"])
                citycountry.append(d["word"])
            elif d['entity_group'] == "CITY":
                print(d["word"])
                citycountry.append(d["word"])
            try:
                address1 = loclst[0]
            except IndexError:
                address1 = (textaddress.partition(",")[0])
                words = address1.split()
                address1 = words[-1]
            # Address = start keyword ... text between ... city/country token.
            star_location = address1.lower()
            end_location = citycountry[0].replace("#", "")
            start = star_location
            end = end_location
            s = textaddress.lower()
            middle_address = (s.split(start))[-1].split(end)[0]
            Address = start + middle_address + end
            Address = Address.replace('--', '').title()
            print(Address)
            # Too few commas -> looks incomplete; use the regex fallback.
            if Address.count(',') < 2:
                splitaddress()
            else:
                final.append('ADDRESS--' + Address)
        except IndexError:
            splitaddress()

        # ############################ Designation ############################
        import re
        new = []
        with open('test.txt', 'r') as f:
            flag = False
            for line in f:
                line1 = line
                line = line.upper()
                matches = re.findall(
                    r'''\bAPPRENTICE\b|\bEXECUTIVE\b|\bPROPRIETOR\b|\bPARTNER\b|\bMD\b|\bANALYST\b|\bPRACTITIONER\b|\bCUSTOMER\b|\bCOO\b|\bCOACH\b|\bADMINISTRATIVE\b|\bADMINISTRATOR\b|\bAGENT\b|\bHEAD\b|\bCHIEF\b|\bDIRECTOR\b|\bVICE\b|\bPRESIDENT\b|\bMANAGER\b|\bCOORDINATOR\b|\bCOUNSELOR\b|\bSUPERVISOR\b|\bASSISTANT\b|\bSPECIALIST\b|\bARTIST\b|\bWORKER\b|\bCONSULTANT\b|\bREPRESENTATIVE\b|\bARCHITECT\b|\bSTAFF\b|\bMEMBER\b|\bDEVELOPER\b|\bENGINEER\b|\bEXAMINOR\b|\bDOCTOR\b|\bPROFESSOR\b|\bTEACHER\b|\bLEAD\b|\bOFFICER\b|\bCEO\b|\bC.E.O\b|\bJUNIOR\b|\bSENIOR\b|\bPROFESSOR\b|\bSALES\b''',
                    line)
                for match in matches:
                    line = line.replace('-', '')
                    o = "Designation--" + line
                    new.append(o)
                    remove_list.append(str(line1).replace('\n', ''))
        try:
            a = new[0].replace('\n', '')
            final.append(a)
        except IndexError:
            final.append("Designation--")

        # ############################ Phone number ############################
        num = []
        import phonenumbers
        # Strip country code / punctuation first, then match Indian numbers.
        numbers = phonenumbers.PhoneNumberMatcher(
            verticaltext.replace('+91', '').replace('(0)', '').replace('(', '').replace(')', ''), "IN")
        for number in numbers:
            number = str(number).split(")")
            num.append(number[1])
        # First number -> contact, last -> organization.
        if len(num) == 0:
            final.append("ContactNumber--")
            final.append("OrganizationNumber--")
        elif len(num) > 1:
            final.append("ContactNumber--" + num[0].replace(' ', ''))
            final.append("OrganizationNumber--" + num[-1].replace(' ', ''))
        elif len(num) == 1:
            try:
                final.append("ContactNumber--" + num[0].replace(' ', ''))
                final.append("OrganizationNumber--")
            except IndexError:
                final.append("ContactNumber--")
                final.append("OrganizationNumber--")
        print(
            '############################################################# num #############################################################')
        print(num)

        # ############################### Email ###############################
        import re
        from email_scraper import scrape_emails
        s = list(scrape_emails(horizontaltext))
        email_id = s
        # First email -> contact, last -> organization.
        if len(email_id) > 1:
            final.append(
                'ContactEmail--' + str(email_id[0]).replace("[", "").replace("]", "").replace("\\n", "").replace("'", ""))
            final.append(
                'OrganizationEmail--' + str(email_id[-1]).replace("[", "").replace("]", "").replace("\\n", "").replace(
                    "'", ""))
        else:
            try:
                final.append(
                    'ContactEmail--' + str(email_id[0]).replace("[", "").replace("]", "").replace("\\n", "").replace(
                        "'", ""))
                final.append('OrganizationEmail--')
            except IndexError:
                final.append('ContactEmail--')
                final.append('OrganizationEmail--')

        # ############################### PINCODE ###############################
        pinlst = []
        print(addrespinlst)
        import pgeocode
        # Remove already-found phone numbers (minus leading digit) from the
        # text so their digits are not mistaken for postal codes.
        lst = []
        for i in num:
            i = i[1:]
            lst.append(i)
        infile = r"vtext.txt"
        outfile = r"cleaned_file.txt"
        import glob
        delete_list = lst
        fin = open(infile, "r+")
        fout = open(outfile, "w+")
        for line12 in fin:
            for word in delete_list:
                line12 = line12.replace(word, "")
            fout.write(line12)
        fin.close()
        # NOTE(review): fout is never closed, and the findall below runs on the
        # LAST line12 only (leaked loop variable), not the whole cleaned file —
        # looks unintentional; confirm before changing behaviour.
        import pgeocode
        import re
        matche1 = re.findall(r'-\d{6}|\b\d{6}\b|\b\d{3} \d{3}\b|-\d{3} \d{3}|\b\d{3} \d{3}\b', line12)
        for i in matche1:
            address3 = i.replace(' ', '').replace('-', '')
            pinlst.append(address3)
        # Resolve the pincode to place/state via pgeocode (offline dataset).
        nomi = pgeocode.Nominatim('IN')
        try:
            a = nomi.query_postal_code(str(pinlst[-1]))
            b = a.keys()
            c = b.values.tolist()
            d = a.tolist()
            postal_code = "PinCode1" + "--" + d[0]
            final.append(postal_code)
            country_code = c[1] + "--" + str(d[1])
            final.append(country_code)
            place_name = 'LandMark1' + "--" + str(d[2])
            final.append(place_name)
            state_name = c[3] + "--" + str(d[3])
            final.append(state_name)
            state_code = c[4] + "--" + str(d[4])
            final.append(state_code)
            county_name = 'CityName1' + "--" + str(d[5])
            final.append(county_name)
        except (IndexError, NameError):
            # No pincode found (or line12 unbound on an empty file).
            final.append("PinCode1--")
            final.append("country_code--")
            final.append("LandMark1--")
            final.append("state_name--")
            final.append("state_code--")
            final.append("CityName1--")

        # ################################ json ################################
        # Round-trip the Key--Value list through CSV to build one JSON object.
        import pandas as pd
        df = pd.DataFrame(final)
        df1 = df[0].str.split('--', expand=True)
        df1.rename({df1.columns[-2]: 'Keys'}, axis=1, inplace=True)
        df1.rename({df1.columns[-1]: 'Values'}, axis=1, inplace=True)
        df1['Keys'] = df1['Keys'].str.strip()
        df1.to_csv('path123.csv', index=False)
        df2 = pd.read_csv('path123.csv')
        print(df2)
        df2 = df2.T
        df2.to_csv('path1.csv', index=False, header=False)
        df1 = pd.read_csv('path1.csv')
        df1.to_json('firstjson1.json', orient="index")
        import json
        with open('firstjson1.json', 'r') as json_file:
            json_load = json.load(json_file)
        # Flatten the {"0": {...}} wrapper into a bare object (string surgery).
        nothing = json.dumps(json_load).replace("]", "").replace("[", "").replace('{"0":', '').replace('}}', '}')
        # Re-attach the original file payload (base64) under "image".
        empty = []
        import base64
        name = found
        image = open(name, 'rb')
        image_read = image.read()
        image_64_encode = base64.b64encode(image_read)
        NULL = 'null'
        empty.append("ByteData--" + (NULL).strip('""'))
        image_64_encode = image_64_encode.decode('utf-8')
        empty.append("FileData--" + str(image_64_encode))
        imagedata = name.split("/")
        imagename = str(imagedata[-1]).replace('"', '').replace("[", "").replace("]", "")
        imagename1 = str(imagename).split('.')
        imagename = str(imagename1[-2]).replace("[", "]")
        empty.append("FileName--" + imagename)
        empty.append("FilePath--" + "")
        imageExtension = str(imagename1[-1]).replace("[", "]")
        empty.append("FileType--" + imageExtension)
        image.close()
        import pandas as pd
        df = pd.DataFrame(empty)
        df = df[0].str.split("--", expand=True)
        data1 = pd.DataFrame(df[0])
        data2 = pd.DataFrame(df[1])
        dt = data2.set_index(data1[0])
        dt4 = dt.T
        dictionary = dt4.to_dict(orient="index")
        list1 = []
        list1.append(dictionary[1])
        print('--------------------')
        import json
        # Merge extracted fields (x) with the file-payload dict (y).
        x = nothing
        y = {"image": dictionary[1]}
        z = json.loads(x)
        z.update(y)
        zlist.append(z)

    # One dict per uploaded card.
    return zlist


if __name__ == "__main__":
    app.run(host='0.0.0.0', port=1112)