# --- Imports (deduplicated: the original repeated several of these 2-3 times) ---
from flask import Flask, render_template, request, redirect, Response, send_file
import os
# import openai
import requests
import pandas as pd
import pgeocode
from email_scraper import scrape_emails
import phonenumbers
from pdfminer.high_level import extract_text
import pytesseract
import time
import multiprocessing
from PIL import Image
from functools import partial
from urlextract import URLExtract
import pytesseract as tess
# from doctr.io import DocumentFile
# from doctr.models import ocr_predictor
# model = ocr_predictor(pretrained=True)
# load tagger
######################################################
import glob
from pytesseract import *  # NOTE(review): wildcard import kept — unseen parts of the file may rely on it
import shutil
import cv2
import matplotlib
from werkzeug.utils import secure_filename
import spacy
import datetime
# nlp_model = spacy.load("D:/projects/C01app/Resume_parser/ME")
# nlp_model1 = spacy.load("D:/projects/C01app/Resume_parser/bdeeducation_50_0.2")
################################################################
# Normalise the working directory to forward slashes (Windows-style paths).
Current_Working_Directory = os.getcwd()
Current_Working_Directory = Current_Working_Directory.replace("\\", "/")
# nlp_model1 = spacy.load(Current_Working_Directory + "/Invoice_parser/p")
################################################################
# import spacy
# nlp_model1 = spacy.load('./ADD3001.2')

# --- Model initialisation (expensive; runs once at import time) ---
from flair.data import Sentence
from flair.models import SequenceTagger
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# HuggingFace token-classification model used for city/country detection in addresses.
tokenizer = AutoTokenizer.from_pretrained("ml6team/bert-base-uncased-city-country-ner")
model = AutoModelForTokenClassification.from_pretrained("ml6team/bert-base-uncased-city-country-ner")

# PaddleOCR engine used to extract text from card images.
from paddleocr import PaddleOCR, draw_ocr
ocr = PaddleOCR(use_angle_cls=True, lang='en', use_space_char=True, show_log=False)

# Flair large English NER tagger (produces ORG / PER / LOC tags).
tagger = SequenceTagger.load("flair/ner-english-large")
# tagger.to("cuda")

app = Flask(__name__)
# app.config["IMAGE_UPLOADS"] = "C:/inetpub/wwwroot/FlaskApplication/Flask_Demo/upload/"


@app.route('/', methods=['GET'])
def home():
    """Render the landing page."""
    return render_template('home.html')


@app.route('/resume', methods=['GET'])
def resume():
    """Render the resume-parser page."""
    return render_template('resume.html')


@app.route('/invoice', methods=['GET'])
def invoice():
    """Render the invoice-parser page."""
    return render_template('invoice.html')


@app.route('/card', methods=['GET'])
def card():
    """Render the business-card-parser page."""
    return render_template('card.html')


@app.route('/upload_BusinessCards', methods=["POST"])
# @app.route('/multiplecards', methods=["POST"])
def multiplecards():
    """Accept a JSON list of base64-encoded card images and parse each one.

    NOTE(review): this handler is truncated in the visible chunk — its body
    continues beyond this source view. Only the visible prefix is reproduced
    here; nothing past it has been guessed at.
    """
    # print('################## multiple card detection #######################')
    # print(Dataset)
    datalist = []
    zlist = []
    # Expected payload (from visible reads): {"visiting": [{"FileData": <b64>,
    # "FileName": ..., "FileType": ...}, ...]} — TODO confirm against caller.
    Dataset = request.get_json()
    # print(data)
    # datalist.append(Dataset)
    data = {'visiting': Dataset}
    for i in data['visiting']:
        import time
        # time.sleep(1)
        a = i
        x = a['FileData']
        # print(x)
        y = a['FileName']
        z = a['FileType']
        # CreatedBy=a['CreatedBy']
        name = y + '.' + z
+ z # print(name) # print(y) # image = y.split("/") # filename=image[-1] # print(x) img_data = x.encode() import base64 with open('./multicards/' + name, "wb") as fh: fh.write(base64.decodebytes(img_data)) # print(i) # import os # import glob # for i in glob.glob('./multipleupload/*'): found = './multicards/' + name print(found) extension = found.split('.')[-1] # for root, dirs, fils in os.glob('./multipleupload'): # for name in files: # foundfile= os.path.join(root, name) # print(foundfile) import re import csv import glob import os # import pytesseract # import cv2 import numpy as np import glob import os import cv2 import requests final = [] # final.append('assignto--'+CreatedBy) imagelist = [] # print(found) remove_list = [] import os import glob import pdfminer # import os # ts = 0 # for file_name in glob.glob('./upload/*'): # fts = os.path.getmtime(file_name) # if fts > ts: # ts = fts # found = file_name # print(found) # print(extension) def org_name(): print('org_name is working') import pytesseract fname = found if extension != 'pdf': img = cv2.imread(fname) img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) cv2.imwrite(str(found), img) from PIL import Image im = Image.open(found) im.save("images1.png", dpi=(1200, 1200)) # import pytesseract fname = "images1.png" import pytesseract as tess from PIL import Image tess.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe" pdf = tess.image_to_pdf_or_hocr(fname, extension="pdf") with open("demo.pdf", "w+b", ) as f: f.write(pdf) from pdfminer.high_level import extract_text text = extract_text('demo.pdf') # doc = DocumentFile.from_images(found) # result = model(doc) # text = result.render() # from pdfminer.high_level import extract_text # txt = extract_text('demo.pdf') else: from pdfminer.high_level import extract_text text = extract_text(fname) sentence = Sentence(text) # predict NER tags tagger.predict(sentence) # print sentence ko = (sentence) ko1 = str(ko).split("→") import pandas as pd dfg = [] 
try: s = ko1[1].replace("", "").replace("", "").replace("/", ":") # os.remove(found) # return 'Invalid image' dfg.append(s) df = pd.DataFrame(dfg) df = df[0] df.to_csv("df.csv", index=False) df1 = pd.read_csv("df.csv") ve = df1["0"].str.split(",") fgf = ve.to_list() dfgh = pd.DataFrame(fgf[0]) maindf = dfgh[0] # .str.split(":") # maindf.to_csv("main.csv") main1 = maindf.to_list() main1 # cv=pd.DataFrame(ve) # cv per = ["PER"] org = ["ORG"] loc = ["LOC"] organizations = [i for i in main1 for j in org if j in i] PErsons = [i for i in main1 for j in per if j in i] location = [i for i in main1 for j in loc if j in i] except IndexError: pass # ************************************* ORGANIZATION ******************************************************************** def organisation(): print('organisation working ') try: if len(("OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace( '.com', '').replace('.in', '') + " /" + organizations[1].replace(":ORG", "").replace('"', '').replace( '.com', ''))) < 4: pass else: match = str(urlfinal[0]).lower() match = match.replace('.com', '').replace('www.', '').replace('.in', '').replace('.co', '').replace( 'https', '').replace( 'http', '').replace(":", "").replace("/", "").upper() print(match) s1g = organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace('.com', '') + " /" + \ organizations[1].replace(":ORG", "").replace('"', '').replace('.com', '') s1 = s1g.upper() s2 = match.upper() from difflib import SequenceMatcher print(s1) print(s2) print(SequenceMatcher(None, s1, s2).ratio()) if SequenceMatcher(None, s1, s2).ratio() >= 0.10: # and SequenceMatcher(None, s1, s2).ratio()<0.50: final.append( "OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace( '.com', '').replace(']', '') + " /" + organizations[1].replace(":ORG", "").replace('"', '').replace( '.com', '').replace(']', '')) else: final.append("OrganizationName--" + 
s2) except IndexError: try: if len(("OrganizationName--" + organizations[0].replace(":ORG", "").replace('[', '').replace(']', '').replace( '"', '').replace( '.com', '').replace('.in', ''))) < 4: pass else: match = str(urlfinal[0]).lower() match = match.replace('.com', '').replace('www.', '').replace('.in', '').replace('.co', '').replace( 'https', '').replace('http', '').replace(":", "").replace("/", "").upper() s1g = organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace('.com', '') s1 = s1g.upper() s2 = match.upper() from difflib import SequenceMatcher print(s1) print(s2) print(SequenceMatcher(None, s1, s2).ratio()) if SequenceMatcher(None, s1, s2).ratio() >= 0.10: # and SequenceMatcher(None, s1, s2).ratio()<0.50: final.append( "OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace( '[', '').replace( ']', '').replace( '.com', '')) else: final.append("OrganizationName--" + s2) except IndexError: try: match = str(urlfinal[0]).lower() match = match.replace('.com', '').replace('www.', '').replace('.in', '').replace('.co', '').upper() final.append("OrganizationName--" + match) # remove_list.append(match) except IndexError: company() #################################################company Name######################################## def company(): print('company list working') import re new = [] with open('test.txt', 'r+') as f: flag = False for line in f: line = line.upper() matches = re.findall( 
r'''\bENTERPRISE\b|\bTRADE\b|\bEMPIRE\b|\bSTORES\b|\bMACHINERY\b|\bINDUSTRIES\b|\bTECHNOLOGY\b|\bCOMPANY\b|\bDESIGNERS\b|\bPOLYMER\b|\bBELT\b|\bAGRO\b|\bPLASTIC\b|\bGROUP\b|\bTOOLS\b|\bENGG.\b|\bSOLUTION\b|\bCONSTRUCTION\b|\bPACK\b|\bELECT\b|\bSTEEL\b|\bIRON\b|\bDIES\b|\bMOULD\b|\bCORPORATION\b|\bSEEDS\b|\bPOWER\b|\bCONSULTANT\b|\bMFG.\b|\bPRINT\b|\bFOOD\b|\bSOLAR\b|\bINDUSTRY\b|\bLIMITED\b|\bPRIVATE\b|\bPVT\b|\bLTD\b|\bOUTSOURCING\b|\bCNC\b|\bMACHINERIES\b|\bSOLUTIONS\b|\bENGINEERS\b|\bWORKS\b|\bPRODUCTS\b|\bENTERPRISES\b|\bCOMPANIES\b|\bPOLYMERS\b|\bTRADING\b''', line) for i in matches: if i in line: flag = True if flag: o = "OrganizationName--" + line new.append(o) # if line.startswith('\n'): # flag = False try: a = new[0].replace('\n', '') final.append(a) except IndexError: final.append("OrganizationName--") # ************************************* CONTACT PERSON ******************************************************************* def contactpersonname(): print('contactpersonname working') try: final.append( "CONTACTPERSONNAME--" + PErsons[0].replace(":PER", "").replace("[", "").replace('"', '').replace( "]", "") + '/' + PErsons[ 1].replace(":PER", "").replace('"', '')) except IndexError: try: final.append( "CONTACTPERSONNAME--" + PErsons[0].replace(":PER", "").replace("[", "").replace("]", "").replace( '"', '')) except IndexError: final.append("CONTACTPERSONNAME--") def image_to_text(): # doc = DocumentFile.from_images(found) # result = model(doc) # image_to_text.txt = result.render() # tess.pytesseract.tesseract_cmd = r"C:\\Program Files (x86)\\Tesseract-OCR\\tesseract.exe" # img = Image.open(found) # text = tess.image_to_string(img) # image_to_text.txt = text # print(text) import cv2 img_path = found img = cv2.imread(img_path) img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) cv2.imwrite(str(found), img) result = ocr.ocr(img_path, cls=True) result = result[0] txts = [line[1][0] for line in result] image_to_text.txt = "" for i in txts: if len(i) < 4: continue # 
print(i+"\n") image_to_text.txt = image_to_text.txt + str(i) + "\n" # print(image_to_text.txt) def pdf_to_text(): from pdfminer.high_level import extract_text pdf_to_text.txt = extract_text(found) # pdf_to_text.txt= text.replace('\n', ' ') extensionlist = ['JPEG', 'jpg', 'png', 'JPG', 'PNG', 'jpeg'] if extension in extensionlist: print('image' + extension) image_to_text() x = image_to_text.txt else: print('pdf' + extension) pdf_to_text() x = pdf_to_text.txt verticaltext = x htext = x # print('------------------------------------------------') # print('############################################################# this is verticaltext #################################################################') print(verticaltext) htext = htext.replace('\n', ' ') # print('############################################################# this is htext #############################################################') # print(htext) y = x.replace('\n', ',') y = y.replace(' ', ' ') # y = y.replace(".", " .") horizontaltext = y # print('------------------------------------------------') # print('############################################################# this is horizontaltext #############################################################') # print(horizontaltext) textfile = open("test123456.txt", "w") a = textfile.write(verticaltext) textfile.close() textfile = open("vtext.txt", "w") a = textfile.write(horizontaltext) textfile.close() with open('test123456.txt', 'r') as f: with open('test.txt', 'w') as w: for line in f: if line.strip().replace('|', ''): w.write(line) ###########################ADDRESS################################## addrespinlst = [] def splitaddress(): import re textaddress = htext.replace('\n', ' ') # print(textaddress) address1 = (textaddress.partition(",")[0]) words = address1.split() address1 = words[-1] addre = (htext.partition(",")[2]) a = addre.replace('\n', ' ').replace('\x0c', '') addre = (a.partition(",")[2]) matches = re.findall( r'(.*?)-\d{3} 
\d{3}|(.*?)\b-\d{6}\b|(.*?)\b\d{6}\b|(.*?)\b\d{3} \d{3}\b|\b(.*?)-\d{2}\b|(.*?)\b\d{3} \d{3}\b', a) for match in matches: address2 = match address2 = str(address2) address2 = address2.replace("'", "").replace("(", "").replace(")", "").replace(', ,', '').replace(' ', '') matches = re.findall(r'-\d{6}\b|\b\d{6}\b|\b\d{3} \d{3}\b|\b-\d{2}\b|-\d{3} \d{3}|\b\d{3} \d{3}\b', a) for address3 in matches: pass try: Address = address1 + "," + address2 + "," + address3 final.append('ADDRESS--' + Address) addrespinlst.append(Address) except NameError: final.append('ADDRESS--') # print('############################################################ Addressmodelworking #############################################################') # doc = nlp_model1(textaddress) # addlist = [] # for ent in doc.ents: # name = (f'{ent.label_.upper():{10}}--{ent.text}') # addlist.append(name) # try: # Address = addlist[0] # final.append(Address) # addrespinlst.append(Address) # remove_list.append( # str(Address).replace("[", "").replace("]", "").replace("\\n", "").replace("'", "").replace( # "ADDRESS--", # "")) # except IndexError: # final.append("ADDRESS--") pass ################################################## website####################################################### # import re # url = [] # matches = re.findall(r'www.*', verticaltext) # for match in matches: # if (match.count('.')) == 1: # a_string1 = match.replace("www", "www.") # final.append("Urls--" + a_string1) # url.append(a_string1) # else: # final.append("Urls--" + match) # if len(url)==0: # from urlextract import URLExtract # extractor = URLExtract() # urls = extractor.find_urls(verticaltext) # try: # urllist = urls[0] # final.append("Urls--"+urllist) # url.append(urllist) # except IndexError: # final.append("Urls--") # for match in matches: # if (match.count('.')) == 1: # a_string1 = match.replace("www", "www.") # final.append("Urls--" + a_string1) # url.append(a_string1) # else: # final.append("Urls--" + match) # 
url.append(match) # remove_list.append(match) # else: # final.append("Urls--" ) ################################################## website####################################################### import re # final=[] url = [] urlfinal = [] matches = re.findall(r'www.*', verticaltext) for match in matches: if (match.count('.')) == 1: a_string1 = match.replace("www", "www.") # final.append("Urls--" + a_string1) url.append(a_string1) else: url.append(match) if len(url) == 0: from urlextract import URLExtract extractor = URLExtract() urls = extractor.find_urls(verticaltext) try: urllist = urls[0] url.append(urllist) url.append(urllist) except IndexError: pass for match in matches: if (match.count('.')) == 1: a_string1 = match.replace("www", "www.") url.append(a_string1) # url.append(a_string1) else: url.append(match) url.append(match) else: pass try: test_string = url[0] test_list = ['com', 'www', 'in', 'co', "WWW", "COM", "CO", "IN"] res = [ele for ele in test_list if (ele in test_string)] if len(res) == 0: print('no match') final.append('urls--') else: print('matched') final.append('urls--' + url[0]) urlfinal.append(url[0]) except IndexError: final.append('urls--') print( '############################################################# url #############################################################') print(url) #######organisation and contact################ # def company_url(): # # print('--url--') # # print(url) # try: # match = str(url[0]).lower() # match =match.replace('.com','').replace('www.','').replace('.in','').replace('.co','').upper() # final.append("OrganizationName--" + match) # # remove_list.append(match) # except IndexError: # org_name() # organisation() # final.append("OrganizationName--") # make example sentence # print(horizontaltext) sentence = Sentence(verticaltext) # predict NER tags tagger.predict(sentence) # print sentence ko = (sentence) ko1 = str(ko).split("→") import pandas as pd dfg = [] try: s = ko1[1].replace("", "").replace("", 
"").replace("/", ":") except IndexError: os.remove(found) return 'Invalid image' dfg.append(s) df = pd.DataFrame(dfg) df = df[0] df.to_csv("df.csv", index=False) df1 = pd.read_csv("df.csv") ve = df1["0"].str.split(",") fgf = ve.to_list() dfgh = pd.DataFrame(fgf[0]) maindf = dfgh[0] # .str.split(":") # maindf.to_csv("main.csv") main1 = maindf.to_list() main1 # cv=pd.DataFrame(ve) # cv per = ["PER"] org = ["ORG"] loc = ["LOC"] organizations = [i for i in main1 for j in org if j in i] PErsons = [i for i in main1 for j in per if j in i] location = [i for i in main1 for j in loc if j in i] # ************************************* ORGANIZATION ******************************************************************** try: if len(("OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace( ']', '').replace( '.com', '') + " /" + organizations[1].replace(":ORG", "").replace('"', '').replace('.com', ''))) < 4: pass # company_url() else: match = str(urlfinal[0]).lower() match = match.replace('.com', '').replace('www.', '').replace('.in', '').replace('.co', '').replace( 'https', '').replace( 'http', '').replace(":", "").replace("/", "").upper() print(match) s1g = organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace(']', '').replace( '.com', '') + " /" + \ organizations[1].replace(":ORG", "").replace('"', '').replace('.com', '') s1 = s1g.upper() s2 = match.upper() from difflib import SequenceMatcher print(s1) print(s2) print(SequenceMatcher(None, s1, s2).ratio()) if SequenceMatcher(None, s1, s2).ratio() >= 0.10: # and SequenceMatcher(None, s1, s2).ratio()<0.50: final.append( "OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace( '.com', '').replace(']', '') + " /" + organizations[1].replace(":ORG", "").replace('"', '').replace( '.com', '').replace(']', '')) else: final.append("OrganizationName--" + s2) except IndexError: try: if len(("OrganizationName--" + 
organizations[0].replace(":ORG", "").replace('[', '').replace(']', '').replace( '"', '').replace( '.com', ''))) < 4: pass # company_url() else: match = str(urlfinal[0]).lower() match = match.replace('.com', '').replace('www.', '').replace('.in', '').replace('.co', '').replace( 'https', '').replace('http', '').replace(":", "").replace("/", "").upper() s1g = organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace(']', '').replace( '.com', '') s1 = s1g.upper() s2 = match.upper() from difflib import SequenceMatcher print(s1) print(s2) print(SequenceMatcher(None, s1, s2).ratio()) if SequenceMatcher(None, s1, s2).ratio() >= 0.10: # and SequenceMatcher(None, s1, s2).ratio()<0.50: final.append( "OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace( ']', '').replace( '.com', '').replace(']', '')) else: final.append("OrganizationName--" + s2) except IndexError: company() # org_name() # organisation() # final.append("OrganizationName--") ################################################### Email###################################################### import re from email_scraper import scrape_emails s = list(scrape_emails(horizontaltext)) email_id1 = s import re email_id=[] # Define a function to extract email addresses from a text def extract_emails(text): email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,7}\b' return re.findall(email_pattern, text) # List of text strings # Iterate through the list and extract email addresses from each value for text in email_id1 : email_addresses = extract_emails(text) # Print the extracted email addresses if email_addresses: # print("Email addresses in the text:") for email in email_addresses: #print(email) email_id.append(email) else: print("No email addresses found in the text.") # Remove "email" if it exists within square brackets email_id = [item.replace("email", "").replace("Email", "").replace("E-mail", "") for item in email_id] # 
************************************* CONTACT PERSON ******************************************************************* try: my_string='Hello' print(my_string[-6]) # final.append( # "CONTACTPERSONNAME--" + PErsons[0].replace(":PER", "").replace("[", "").replace('"', '').replace("]", # "") + # PErsons[ # 1].replace(":PER", "").replace('"', ''))+PErsons[2].replace(":PER", "").replace("[", "").replace('"', '').replace("]","") except IndexError: try: final.append( "CONTACTPERSONNAME--" + PErsons[0].replace(":PER", "").replace("[", "").replace("]", "").replace( '"', '')) person_name=PErsons[0].replace(":PER", "").replace("[", "").replace("]", "").replace('"', '').replace(' ','') if not email_id: final.append('ContactEmail--') final.append('OrganizationEmail--') else: per_Name=[] per_Name.append(person_name) print(email_id) def calculate_matching_percentage(word_list, words): def calculate_single_matching_percentage(word, item): max_length = max(len(word), len(item)) word = word.upper() item = item.strip().replace(" ", "").upper() matching_chars = sum(1 for c1, c2 in zip(item, word) if c1 == c2) return (matching_chars / max_length) * 100 highest_percentage = 0.0 highest_matching_item = None for word in words: word = word.upper() for item in word_list: original_item = item item = item.strip().replace(" ", "").upper() matching_percentage = calculate_single_matching_percentage(word, item) if matching_percentage > highest_percentage: highest_percentage = matching_percentage highest_matching_item = original_item return highest_matching_item, highest_percentage word_list = email_id per_Name = [item.split('.')[1] if '.' 
in item else item for item in per_Name] print(per_Name) word2 = per_Name for word in word2: highest_matching_item, highest_percentage = calculate_matching_percentage(word_list, [word]) if highest_matching_item is not None: print( f"For '{word}', the highest matching percentage is {highest_percentage:.2f}% with '{highest_matching_item}'") else: print(f"For '{word}', no matches found.") #final.append('OrganistaionEmail--' + email_id[0]) if len(word_list) == 1: if highest_percentage >= 15: print(highest_matching_item) final.append( 'ContactEmail--' + str(highest_matching_item).replace("[", "").replace("]", "").replace( "\\n", "").replace("'", "")) final.append('OrganizationEmail--') else: print('not matched') final.append('OrganistaionEmail--' + email_id[0]) final.append('ContactEmail--') else: print('it as more elemnt') if highest_percentage >= 15: print(highest_matching_item) final.append('ContactEmail--' + str(highest_matching_item).replace("[", "").replace("]", "").replace("\\n", "").replace("'", "")) # Given list of email addresses email_list = word_list # Email address to remove email_to_remove = highest_matching_item # Check if the email address is in the list before removing it if email_to_remove in email_list: email_list.remove(email_to_remove) print(f"'{email_to_remove}' has been removed from the list.") else: print(f"'{email_to_remove}' is not in the list.") # Print the updated list print("Updated email list:", email_list) final.append('OrganistaionEmail--' + str(email_list[0]).replace("[", "").replace("]", "").replace("\\n","").replace("'", "")) else: final.append('OrganistaionEmail--' + str(email_id[0]) +','+ str(email_id[1])) except IndexError: # org_name() # contactpersonname() final.append("CONTACTPERSONNAME--") if len(email_id) > 1: final.append( 'OrganizationEmail--' + str(email_id[0]).replace("[", "").replace("]", "").replace("\\n", "").replace("'", "")) final.append( 'ContactEmail--' + str(email_id[-1]).replace("[", "").replace("]", 
"").replace("\\n", "").replace( "'", "")) else: try: final.append( 'ContactEmail--' + str(email_id[0]).replace("[", "").replace("]", "").replace("\\n", "").replace( "'", "")) final.append('OrganizationEmail--') except IndexError: final.append('ContactEmail--') final.append('OrganizationEmail--') ###############address flair##################### try: print( '############################################################# address new code #############################################################') loactionlst = ['address', 'factory', 'd.no', 'h.no', 'h. no', 'plot', 'flat', 'plat'] loclst = [i for i in loactionlst if i in htext.lower()] textaddress = htext textaddress = textaddress.replace("|", ",") textaddress = textaddress.lower() nlp = pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy="simple") grop = nlp(textaddress) citycountry = [] print('########################### city or country name ###########################') d = grop[-1] if d['entity_group'] == "COUNTRY": print(d["word"]) citycountry.append(d["word"]) elif d['entity_group'] == "CITY": print(d["word"]) citycountry.append(d["word"]) try: address1 = loclst[0] except IndexError: address1 = (textaddress.partition(",")[0]) words = address1.split() address1 = words[-1] star_location = address1.lower() end_location = citycountry[0].replace("#", "") start = star_location end = end_location s = textaddress.lower() middle_address = (s.split(start))[-1].split(end)[0] Address = start + middle_address + end Address = Address.replace('--', '').title() print(Address) if Address.count(',') < 2: splitaddress() else: final.append('ADDRESS--' + Address) # star_location = location[0].replace(":LOC", "").replace('"', '').replace('[', '') # end_location = location[-1].replace(":LOC", "").replace('"', '').replace(']', '') # d1 = star_location.split() # d2 = end_location.split() # d3 = d1[0] # d4 = d2[0] # start = d3 # end = d4 # s = horizontaltext # middle_address = ((s.split(start))[1].split(end)[0]) # 
Address = d3 + middle_address + d4 # final.append('ADDRESS--' + Address) # addrespinlst.append(Address) except IndexError: splitaddress() ########################################## Designation ########################################### import re new = [] with open('test.txt', 'r') as f: flag = False for line in f: line1 = line line = line.upper() matches = re.findall( r'''\bAPPRENTICE\b|\bEXECUTIVE\b|\bPROPRIETOR\b|\bPARTNER\b|\bMD\b|\bANALYST\b|\bPRACTITIONER\b|\bCUSTOMER\b|\bCOO\b|\bCOACH\b|\bADMINISTRATIVE\b|\bADMINISTRATOR\b|\bAGENT\b|\bHEAD\b|\bCHIEF\b|\bDIRECTOR\b|\bVICE\b|\bPRESIDENT\b|\bMANAGER\b|\bCOORDINATOR\b|\bCOUNSELOR\b|\bSUPERVISOR\b|\bASSISTANT\b|\bSPECIALIST\b|\bARTIST\b|\bWORKER\b|\bCONSULTANT\b|\bREPRESENTATIVE\b|\bARCHITECT\b|\bSTAFF\b|\bMEMBER\b|\bDEVELOPER\b|\bENGINEER\b|\bEXAMINOR\b|\bDOCTOR\b|\bPROFESSOR\b|\bTEACHER\b|\bLEAD\b|\bOFFICER\b|\bCEO\b|\bC.E.O\b|\bJUNIOR\b|\bSENIOR\b|\bPROFESSOR\b|\bSALES\b''', line) for match in matches: line = line.replace('-', '') # print(line) o = "Designation--" + line new.append(o) remove_list.append(str(line1).replace('\n', '')) try: a = new[0].replace('\n', '') final.append(a) except IndexError: final.append("Designation--") ###################################################Phone number################################################# num = [] import phonenumbers # print(verticaltext) numbers = phonenumbers.PhoneNumberMatcher( verticaltext.replace('+91', '').replace('(0)', '').replace('(', '').replace(')', '').replace('-', '').replace( ' ', ''), "IN") for number in numbers: number = str(number).split(")") num.append(number[1]) # num.append(number[-1]) print(num) import re # Input list of strings # num =[' 7227906777Extn1204634444'] # Define a regular expression pattern to split when text is present pattern = r'[a-zA-Z]+' # Function to split a string based on the pattern def split_string(text): return re.split(pattern, text) # Process each line in the list split_lines = [split_string(line) for line in 
num] # Flatten the list of lists into a single list split_lines = [item for sublist in split_lines for item in sublist] # Remove any empty strings num = [item for item in split_lines if item] # Print the split lines print(num) if len(num) == 0: final.append("ContactNumber--") final.append("OrganizationNumber--") elif len(num) > 1: final.append("ContactNumber--" + num[0].replace(' ', '')) final.append("OrganizationNumber--" + num[-1].replace(' ', '')) elif len(num) == 1: try: final.append("ContactNumber--" + num[0].replace(' ', '')) final.append("OrganizationNumber--") except IndexError: final.append("ContactNumber--") final.append("OrganizationNumber--") print( '############################################################# num #############################################################') print(num) # try: # final.append("PhoneNumber--" + num[0].replace(' ', '')) # remove_list.append(num[0]) # except IndexError: # pass # try: # final.append("PhoneNumber1--" + num[1].replace(' ', '')) # remove_list.append(num[1]) # except IndexError: # pass # try: # final.append("PhoneNumber2--" + num[2].replace(' ', '')) # remove_list.append(num[2]) # except IndexError: # pass ###############PINCODE############ pinlst = [] print(addrespinlst) import pgeocode # try: # matche1 = re.findall(r'-\d{6}|\b\d{6}\b|\b\d{3} \d{3}\b|-\d{3} \d{3}|\b\d{3} \d{3}\b', addrespinlst[0]) # for i in matche1: # address3 = i.replace(' ', '').replace('-', '') # pinlst.append(address3) # except IndexError: lst = [] for i in num: i = i[1:] lst.append(i) infile = r"vtext.txt" outfile = r"cleaned_file.txt" import glob delete_list = lst # delete_list = ["firstname1 lastname1","firstname2 lastname2","firstnamen lastnamen",'Director - Sales & Business Development'] fin = open(infile, "r+") fout = open(outfile, "w+") for line12 in fin: for word in delete_list: line12 = line12.replace(word, "") fout.write(line12) fin.close() # print(line) # print(addrespinlst) import pgeocode # print(line12) import re matche1 = 
re.findall(r'-\d{6}\b|\b\d{6}\b|\b\d{3} \d{3}\b|-\d{3} \d{3}|\b\d{3} \d{3}\b', line12) for i in matche1: address3 = i.replace(' ', '').replace('-', '') pinlst.append(address3) nomi = pgeocode.Nominatim('IN') try: a = nomi.query_postal_code(str(pinlst[-1])) # print(a) b = a.keys() c = b.values.tolist() d = a.tolist() postal_code = "PinCode1" + "--" + d[0] final.append(postal_code) country_code = c[1] + "--" + str(d[1]) final.append(country_code) place_name = 'LandMark1' + "--" + str(d[2]) final.append(place_name) state_name = c[3] + "--" + str(d[3]) final.append(state_name) state_code = c[4] + "--" + str(d[4]) final.append(state_code) county_name = 'CityName1' + "--" + str(d[5]) final.append(county_name) except (IndexError, NameError): final.append("PinCode1--" + " ") final.append("country_code--") final.append("LandMark1--") final.append("state_name--") final.append("state_code--") final.append("CityName1--") ######################################################## json ##################################################################### import pandas as pd df = pd.DataFrame(final) df1 = df[0].str.split('--', expand=True) # print(df1) df1.rename({df1.columns[-2]: 'Keys'}, axis=1, inplace=True) df1.rename({df1.columns[-1]: 'Values'}, axis=1, inplace=True) df1['Keys'] = df1['Keys'].str.strip() df1.to_csv('path123.csv', index=False) df2 = pd.read_csv('path123.csv') print(df2) if df2['Values'].isnull().all(): print("Column 'Column2' is empty.") return 'Invalid image' else: pass df2 = df2.T df2.to_csv('path1.csv', index=False, header=False) df1 = pd.read_csv('path1.csv') df1.to_json('firstjson1.json', orient="index") import json with open('firstjson1.json', 'r') as json_file: json_load = json.load(json_file) # # url = "https://test.bizgaze.app:8443/apis/v4/bizgaze/integrations/businesscards/create" nothing = json.dumps(json_load).replace("]", "").replace("[", "").replace('{"0":', '').replace('}}', '}') # # 
print('--------------------------------------------------------------------------') # # print(nothing) empty = [] import base64 name = found image = open(name, 'rb') image_read = image.read() image_64_encode = base64.b64encode(image_read) NULL = 'null' empty.append("ByteData--" + (NULL).strip('""')) image_64_encode = image_64_encode.decode('utf-8') empty.append("FileData--" + str(image_64_encode)) imagedata = name.split("/") imagename = str(imagedata[-1]).replace('"', '').replace("[", "").replace("]", "") imagename1 = str(imagename).split('.') imagename = str(imagename1[-2]).replace("[", "]") empty.append("FileName--" + imagename) empty.append("FilePath--" + "") imageExtension = str(imagename1[-1]).replace("[", "]") empty.append("FileType--" + imageExtension) image.close() import pandas as pd df = pd.DataFrame(empty) df = df[0].str.split("--", expand=True) data1 = pd.DataFrame(df[0]) data2 = pd.DataFrame(df[1]) dt = data2.set_index(data1[0]) dt4 = dt.T dictionary = dt4.to_dict(orient="index") list1 = [] # list.append(a) list1.append(dictionary[1]) # # final.append("image--"+str(dictionary[1]).replace("\'",'"')) print('--------------------') # print(namelist) import json # JSON data: x = nothing # python object to be appended y = {"image": dictionary[1]} # parsing JSON string: z = json.loads(x) # appending the data z.update(y) # the result is a JSON string: # print(json.dumps(z)) zlist.append(z) #############################################creating csv##################################### # print(final) # print(imagelist) # final.append('image--' + str(imagelist)) # import requests # import json # # url = "https://anwi.bizgaze.app/apis/v4/bizgaze/integrations/businesscards/create/list" #dev # url = "https://qa.bizgaze.com/apis/v4/bizgaze/integrations/businesscards/create/list" #testing # # url = "https://test.bizgaze.app/apis/v4/bizgaze/integrations/businesscards/create" # test # # url='http://localhost:3088/apis/v4/bizgaze/integrations/businesscards/create' # # url 
= 'https://c01.bizgaze.app/apis/v4/bizgaze/integrations/businesscards/create' # C01 # payload1 = json.dumps(zlist) # # print('--------------------------------------------------------------------------') # #print(payload1) # headers = { # #'Authorization': 'stat 1a936137490040c997928f485e3cdd7a', #dev # # 'Authorization': 'stat 16516391d0074f4c8a15ea16fb49470b',#testing # # 'Authorization': 'stat 08e55fcfbaa940c8ab8145a074c444d1', # # 'Authorization': 'stat f7cdb402e01e44e5842878653946168f', # c01 # #'Authorization': 'Stat c3e11b2fcbfe455b86a1fe6efde02a69',#demo # 'Authorization':'Stat e5bc6ad08f2c42feb5f98a2a521d00af', # 'Content-Type': 'application/json' # } # response = requests.request("POST", url, headers=headers, data=payload1) # # print("##############################################################") # print(payload1) # #print(zlist) # # import os # # if 'BusinessCards Created Successfully' in response.text: # # print('present') # # os.remove(found) # # else: # # print('not present') # df1.to_json('visitingcard.json') # data = df1.to_json('visiting.json', orient='records') # print(data) # return render_template('index.html') # return response.text return z # return zlist # @app.route('/upload_BusinessCards', methods=["POST"]) # def mainfunction(): # Dataset = request.get_json() # if len(Dataset)==1: # # predict(Dataset) # return multiplecards(Dataset) # else: # # multiplecards(Dataset) # return multiplecards(Dataset) ################################################################################### Resume parser ################################################################################################### @app.route("/upload_resume", methods=["POST"]) def predict_resume(): Dataset = request.get_json() # data = {'visiting': Dataset} # a=url_list[0] a = Dataset # a = url_list # print(a) x = a['FileData'] # print(x) y = a['FileName'] y = y.replace(' ', '') y = y.replace('&', '') y = y.replace('@', '') z = a['FileType'] # CreatedBy=a['CreatedBy'] name = y 
+ '.' + z print(name) # img_data = x.encode() img_data = x.encode() import base64 with open('./Resume_parser/upload_resume/' + name, "wb") as fh: fh.write(base64.decodebytes(img_data)) # cmd = "python ./Resume_parser/resume1.0.multiprocessing.py" + " " + str('./Resume_parser/upload_resume/' + name) # os.system(cmd) # f = "./resume_upload" # f = os.listdir(f) f = './Resume_parser/upload_resume/' + name found = './Resume_parser/upload_resume/' + name print('this from resumepy file') print(f) def docx_to_txt(): import docx2txt import glob text = '' for file in glob.glob(found): c = docx2txt.process(file) c = c.rstrip("\n") toPrint = c d = ' '.join(i for i in toPrint.split()) d = d.rstrip() text += d docx_to_txt.text = text def doc_to_txt(): import docx2txt import glob text = '' # for file in glob.glob(found): c = docx2txt.process(f) c = c.rstrip("\n") toPrint = c d = ' '.join(i for i in toPrint.split()) d = d.rstrip() text += d doc_to_txt.text = text def pdf_to_txt(): import sys import fitz fname = found doc = fitz.open(fname) text = "" for page in doc: text = text + str(page.get_text()) pdf_to_txt.text = " ".join(text.split('\n')) # for file in f: print('checking for filetype') if f.endswith('.doc'): doc_to_txt() x = doc_to_txt.text elif f.endswith('.docx'): docx_to_txt() x = docx_to_txt.text elif f.endswith('.pdf'): pdf_to_txt() x = pdf_to_txt.text doc = nlp_model(x) k = [] l = [] for ent in doc.ents: # print(f'{ent.label_.upper():{30}}- {ent.text}') k.append(ent.label_.upper()) l.append(ent.text) columns = k rows = [l] import pandas as pd data = pd.DataFrame(rows, columns=columns) df = data data = df.T data.to_csv('./Resume_parser/Ad1.csv', index=True) data = pd.read_csv('./Resume_parser/Ad1.csv') # print(data) data.rename({data.columns[-2]: 'Key'}, axis=1, inplace=True) data.rename({data.columns[-1]: 'Values'}, axis=1, inplace=True) data.to_csv('./Resume_parser/Ad1.csv', index=False) 
##################################################################################################### # ModelName = "text-davinci-003" # prompt_value = 'find designation in key value pairs from below text?' + "/n" + str(x) # max_token_value = 300 # # usertext= request.get_data() # # output = usertext.decode() # # print(output) # import os # import openai # # print(usertext) # openai.api_key = "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" # # userinput='fibonacci series in python' # import os # import openai # # openai.api_key = os.getenv("OPENAI_API_KEY") # response_text = openai.Completion.create( # model=ModelName, # prompt=prompt_value, # temperature=0, # max_tokens=max_token_value, # top_p=1, # frequency_penalty=0, # presence_penalty=0, # stop=["\"\"\""] # ) # a = response_text['choices'] # data = a[0]['text'] # data=data.replace('\n','$@$') # data=data.replace('$@$$@$','') # #data=data.replace(':','') # print(data) # data=data.replace('Designation','POSITION') # data=data.split('$@$') # print(data) # import pandas as pd # desgnaition=pd.DataFrame(data) # desgnaition=desgnaition[0].str.split(':',expand=True) # desgnaition.columns=['Key','Values'] # print(desgnaition) # data= pd.read_csv('./Resume_parser/Ad1.csv') # frames = [data,desgnaition] # result = pd.concat(frames,axis=0) # result.to_csv('./Resume_parser/Ad1.csv', index=False) ######################################################################################################## # df2 = pd.read_csv('./Ad1.csv') x1 = pd.read_csv('D:/projects/C01app/Resume_parser/AD11.csv') tp = pd.read_csv('./Resume_parser/Ad1.csv') # tp = tp.loc[:, ~tp.columns.str.contains('^Unnamed')] merge = pd.merge(tp, x1, on='Key', how='right') merge.to_csv('./Resume_parser/AD.csv', index=False) df2 = pd.read_csv('./Resume_parser/AD.csv') # print(df2) df2 = df2.T df2.to_csv('./Resume_parser/path.csv', index=False, header=False) df1 = pd.read_csv('./Resume_parser/path.csv') df1.to_json('./Resume_parser/firstjson.json', 
orient="index") print(df1) doc = nlp_model1(x) k = [] l = [] for ent in doc.ents: # print(f'{ent.label_.upper():{30}}- {ent.text}') k.append(ent.label_.upper()) l.append(ent.text) columns = k rows = [l] data = pd.DataFrame(rows, columns=columns) df = data data = df.T data.to_csv('./Resume_parser/Ad2.csv', index=True) data = pd.read_csv('./Resume_parser/Ad2.csv') data.rename({data.columns[-2]: 'Key'}, axis=1, inplace=True) data.rename({data.columns[-1]: 'Values'}, axis=1, inplace=True) data.to_csv('./Resume_parser/Ad2.csv', index=False) import pandas as pd import json dflist = [] x = pd.read_csv('D:/projects/C01app/Resume_parser/PG.csv') tp = pd.read_csv('./Resume_parser/Ad2.csv') # tp = tp.loc[:, ~tp.columns.str.contains('^Unnamed')] merge = pd.merge(x, tp, on='Key', how='left') import numpy as np merge = merge.replace(np.nan, '', regex=True) merge.to_csv('./Resume_parser/PGmerge.csv', index=False) dfPG = pd.read_csv('./Resume_parser/PGmerge.csv') import numpy as np dfPG = dfPG.replace({np.nan: None}) x2 = dfPG.iloc[:, -2].tolist() y2 = dfPG.iloc[:, -1].tolist() z1 = dict(zip(x2, y2)) dflist.append(z1) # u1 = json.dumps(z1) import pandas as pd x = pd.read_csv('D:/projects/C01app/Resume_parser/UG.csv') tp = pd.read_csv('./Resume_parser/Ad2.csv') # tp = tp.loc[:, ~tp.columns.str.contains('^Unnamed')] merge = pd.merge(x, tp, on='Key', how='left') import numpy as np merge = merge.replace(np.nan, '', regex=True) merge.to_csv('./Resume_parser/UGmerge.csv', index=False) dfUG = pd.read_csv('./Resume_parser/UGmerge.csv') import numpy as np dfUG = dfUG.replace({np.nan: None}) x2 = dfUG.iloc[:, -2].tolist() y2 = dfUG.iloc[:, -1].tolist() z2 = dict(zip(x2, y2)) dflist.append(z2) # u2 = json.dumps(z2) # final = '[' + str(z1) + ',' + str(z2) + ']' # return render_template('resume.html') ############################################################################ import pandas as pd x = pd.read_csv('D:/projects/C01app/Resume_parser/inter.csv') tp = 
pd.read_csv('./Resume_parser/Ad2.csv') # tp = tp.loc[:, ~tp.columns.str.contains('^Unnamed')] merge = pd.merge(x, tp, on='Key', how='left') import numpy as np merge = merge.replace(np.nan, '', regex=True) merge.to_csv('./Resume_parser/intermerge.csv', index=False) dfinter = pd.read_csv('./Resume_parser/intermerge.csv') import numpy as np dfinter = dfinter.replace({np.nan: None}) x2 = dfinter.iloc[:, -2].tolist() y2 = dfinter.iloc[:, -1].tolist() z3 = dict(zip(x2, y2)) dflist.append(z3) ############################################################################ import pandas as pd x = pd.read_csv('D:/projects/C01app/Resume_parser/SSC.csv') tp = pd.read_csv('./Resume_parser/Ad2.csv') # tp = tp.loc[:, ~tp.columns.str.contains('^Unnamed')] merge = pd.merge(x, tp, on='Key', how='left') import numpy as np merge = merge.replace(np.nan, '', regex=True) merge.to_csv('./Resume_parser/sscmerge.csv', index=False) dfssc = pd.read_csv('./Resume_parser/sscmerge.csv') import numpy as np dfssc = dfssc.replace({np.nan: None}) x2 = dfssc.iloc[:, -2].tolist() y2 = dfssc.iloc[:, -1].tolist() z4 = dict(zip(x2, y2)) dflist.append(z4) ############################################Document############################################################ import base64 empty = [] name = f image = open(name, 'rb') image_read = image.read() image_64_encode = base64.b64encode(image_read) NULL = 'null' # empty.append("ByteData--" + (NULL).strip('""')) image_64_encode = image_64_encode.decode('utf-8') empty.append("FileData--" + str(image_64_encode)) imagedata = name.split("/") imagename = str(imagedata[-1]).replace('"', '').replace("[", "").replace("]", "") imagename1 = str(imagename).split('.') imagename = str(imagename1[-2]).replace("[", "]") empty.append("FileName--" + imagename) empty.append("FilePath--" + "") imageExtension = str(imagename1[-1]).replace("[", "]") empty.append("FileType--" + imageExtension) import pandas as pd df = pd.DataFrame(empty) df = df[0].str.split("--", expand=True) data1 
= pd.DataFrame(df[0]) data2 = pd.DataFrame(df[1]) dt = data2.set_index(data1[0]) dt4 = dt.T list = [] dictionary = dt4.to_dict(orient="index") a = { "FileId": 0, "FileData": "", "FileName": "", "FileType": "", "RefId": 0 } list = [] list.append(a) list.append(dictionary[1]) import json with open('./Resume_parser/firstjson.json', 'r') as json_file: json_load = json.load(json_file) # url = "https://test.bizgaze.app:8443/apis/v4/bizgaze/integrations/businesscards/create" nothing = json.dumps(json_load).replace("]", "").replace("[", "").replace('{"0":', '').replace('}}', '}') import json # JSON data: x = nothing # python object to be appended y = {"EducationDetails": dflist} y1 = {"Document": list} print(y) # parsing JSON string: z = json.loads(x) # appending the data z.update(y) z.update(y1) # the result is a JSON string: # print(json.dumps(z)) print('##########################') # print(z) print('##########################') import requests import json # with open('visitingcard1.json', 'r') as json_file: # json_load = json.load(json_file) # url = "https://qa.bizgaze.app/apis/v4/bizgaze/integrations/resumeparsing/save" #dev # # url = "https://test.bizgaze.app:8443/apis/v4/bizgaze/integrations/resumeparsing/save" # #url = "https://qa.bizgaze.app/apis/v4/bizgaze/integrations/resumeparsing/save" #testing # payload1 = json.dumps(z) # print('--------------------------------------------------------------------------') # # print(payload1) # headers = { # # 'Authorization': 'stat 53f27e671adf456e974f1d11ceb5db41', # #'Authorization': 'stat 5702ce5a77d34e0381bc2f06588d9bcc',#dev # 'Authorization': 'stat ed5dd14ee2094227849f6bbe2928bff3', #testing # 'Content-Type': 'application/json' # } # response = requests.request("POST", url, headers=headers, data=payload1) # print("##############################################################") # print(response.text) # function_1.var=response # a=str(response.text) files = glob.glob('./resume_upload/*') for f in files: os.remove(f) return 
# NOTE: the '/upload_resume' route is registered directly on predict_resume();
# this multiprocessing wrapper is currently disabled (its decorator was
# commented out in the original source) and is kept only for reference.
def upload_resume():
    # NOTE(review): a __main__ guard inside a request handler looks like a
    # Windows multiprocessing safety guard — confirm it ever fires under a
    # WSGI server, where __name__ is the module name, not "__main__".
    if __name__ == "__main__":
        url_list = []
        Dataset = request.get_json()
        url_list.append(Dataset)
        # One worker process per request: runs predict_resume out-of-process.
        with multiprocessing.Pool(processes=1) as pool:
            results = pool.map(predict_resume, url_list)
            pool.close()
        return results[0]


@app.route("/Download_resume")
def Download_resume():
    """Serve the last extracted resume CSV as a file download."""
    with open("Ad1.csv", encoding="unicode_escape") as fp:
        csv_text = fp.read()
    return Response(
        csv_text,
        mimetype="text/csv",
        headers={"Content-disposition": "attachment; filename=Resume.csv"},
    )


############################### Invoice parser ###############################


@app.route('/upload_invoice', methods=["POST", "GET"])
def upload_invoice():
    """Parse an uploaded invoice (image or PDF) and return extracted fields.

    Images are OCR'd with Tesseract into a searchable PDF first; PDFs are
    read directly with PyMuPDF.  The text is run through ``nlp_model1``,
    merged against the master key CSVs under ``Invoice_parser/`` and
    returned as a JSON object with ``InvoiceItems`` and ``Document``.
    """
    import base64
    import json
    from csv import writer

    payload = request.get_json()
    file_data = payload['FileData']
    name = payload['FileName'] + '.' + payload['FileType']
    print(name)

    # Decode the base64 body and persist the original upload.
    with open('./Invoice_parser/upload_invoice/' + name, "wb") as fh:
        fh.write(base64.decodebytes(file_data.encode()))

    name = './Invoice_parser/upload_invoice/' + name
    extension = name.split('.')[-1]

    def _image_to_text():
        # OCR the image into a temporary searchable PDF, then read the text
        # layer back with PyMuPDF.
        print('####################### image-to-pdf ################')
        import cv2
        import pytesseract as tess
        import fitz
        print(name)
        # NOTE(review): Windows-specific tesseract path — confirm deployment.
        tess.pytesseract.tesseract_cmd = r"C:\Program Files (x86)\Tesseract-OCR\tesseract.exe"
        img = cv2.imread(name)
        pdf = tess.image_to_pdf_or_hocr(img, extension="pdf")
        with open(Current_Working_Directory + "/Invoice_parser/demo.pdf", "w+b") as f:
            f.write(pdf)
        print('demo created')
        doc = fitz.open(Current_Working_Directory + '/Invoice_parser/demo.pdf')
        text = "".join(str(page.get_text()) for page in doc)
        return " ".join(text.split("\n"))

    def _pdf_to_text():
        import fitz
        doc = fitz.open(name)
        text = "".join(str(page.get_text()) for page in doc)
        return " ".join(text.split("\n"))

    if extension in ('JPEG', 'jpg', 'png', 'JPG', 'PNG', 'jpeg'):
        print('image' + extension)
        text = _image_to_text()
    else:
        print('pdf' + extension)
        text = _pdf_to_text()

    # ---- NER over the raw text -> Key/Values CSV ----
    # NOTE(review): nlp_model1's spaCy load is commented out at the top of
    # this file — confirm it is defined in the deployed build.
    doc = nlp_model1(text)
    labels = [ent.label_.upper() for ent in doc.ents]
    values = [ent.text for ent in doc.ents]
    frame = pd.DataFrame([values], columns=labels).T
    frame.to_csv(Current_Working_Directory + "/Invoice_parser/Invoice.csv")
    df = pd.read_csv(Current_Working_Directory + "/Invoice_parser/Invoice.csv")
    df.rename({df.columns[-2]: 'Key'}, axis=1, inplace=True)
    df.rename({df.columns[-1]: 'Values'}, axis=1, inplace=True)
    df['Key'] = df['Key'].str.replace('/', '')
    df['Key'] = df['Key'].str.replace(' ', '')
    df.to_csv(Current_Working_Directory + '/Invoice_parser/final.csv', index=False)

    # ---- split "label: value" style fields on the colon ----
    extracted = pd.read_csv(Current_Working_Directory + '/Invoice_parser/final.csv')
    with_colon = pd.read_csv(Current_Working_Directory + '/Invoice_parser/finalwithcolen.csv')
    merge1 = pd.merge(extracted, with_colon, on='Key', how='right')
    merge1['Values'] = merge1['Values'].astype(str)
    split_vals = merge1['Values'].str.split(":", expand=True)
    split_vals.rename({split_vals.columns[-1]: 'Values'}, axis=1, inplace=True)
    result = pd.concat([merge1['Key'], split_vals['Values']], axis=1)

    without_colon = pd.read_csv(Current_Working_Directory + '/Invoice_parser/finalwithoutcolen.csv')
    merged = pd.merge(extracted, without_colon, on='Key', how='right')
    result1 = pd.concat([result, merged])
    result1.to_csv(Current_Working_Directory + '/Invoice_parser/final1.csv', index=False)

    # ---- right-join against the master header key list ----
    main_keys = pd.read_csv(Current_Working_Directory + '/Invoice_parser/main.csv')
    tp = pd.read_csv(Current_Working_Directory + '/Invoice_parser/final1.csv')
    tp['Key'] = tp['Key'].astype(str).str.strip()
    tp['Values'] = tp['Values'].astype(str).str.strip()
    merge = pd.merge(tp, main_keys, on='Key', how='right')
    merge.to_csv(Current_Working_Directory + '/Invoice_parser/invoicewithouttable.csv',
                 index=False)

    # Append a blank PlantCode row.  BUG FIX: the csv module requires the
    # file opened with newline='' (otherwise blank rows appear on Windows);
    # the redundant close() inside the with-block was also dropped.
    with open(Current_Working_Directory + '/Invoice_parser/invoicewithouttable.csv',
              'a', newline='') as f_object:
        writer(f_object).writerow(['PlantCode', " "])

    df2 = pd.read_csv(Current_Working_Directory + '/Invoice_parser/invoicewithouttable.csv')
    print(df2)
    df2 = df2.T
    df2.to_csv(Current_Working_Directory + '/Invoice_parser/invoicewithouttable.csv',
               index=False, header=False)
    df1 = pd.read_csv(Current_Working_Directory + '/Invoice_parser/invoicewithouttable.csv')
    df1.to_json(Current_Working_Directory + '/Invoice_parser/firstjson.json',
                orient="index")

    # ---- line-item table: group repeated item keys, split into columns ----
    extracted = pd.read_csv(Current_Working_Directory + '/Invoice_parser/final.csv')
    item_keys = pd.read_csv(Current_Working_Directory + '/Invoice_parser/item1.csv')
    extracted['Values'] = extracted['Values'].str.strip()
    items = pd.merge(item_keys, extracted, on='Key', how='inner')
    items = items.groupby('Key').agg({'Values': '/'.join}).reset_index()
    split_items = items['Values'].str.split('/', expand=True)
    items = pd.concat([items, split_items], axis=1).drop(['Values'], axis=1)
    items = pd.merge(item_keys, items, on='Key', how='inner').T
    header = items.iloc[0]        # first row becomes the column header
    items = items[1:]
    items.columns = header
    invoice_Item = items.to_dict('records')
    print(invoice_Item)

    # ---- Document attachment: re-encode the stored upload as base64 ----
    empty = []
    with open(name, 'rb') as image:  # BUG FIX: handle was never closed before
        image_64_encode = base64.b64encode(image.read()).decode('utf-8')
    empty.append("FileData--" + image_64_encode)
    imagename = name.split("/")[-1].replace('"', '').replace("[", "").replace("]", "")
    name_parts = imagename.split('.')
    empty.append("FileName--" + name_parts[-2].replace("[", "]"))
    empty.append("FilePath--" + name)
    empty.append("FileType--" + name_parts[-1].replace("[", "]"))

    df = pd.DataFrame(empty)
    df = df[0].str.split("--", expand=True)
    keys_col = pd.DataFrame(df[0])
    vals_col = pd.DataFrame(df[1])
    dictionary = vals_col.set_index(keys_col[0]).T.to_dict(orient="index")
    # Renamed from `list` to avoid shadowing the builtin.
    doc_list = [
        {"FileId": 0, "FileData": "", "FileName": "", "FileType": "", "RefId": 0},
        dictionary[1],
    ]

    with open(Current_Working_Directory + '/Invoice_parser/firstjson.json', 'r') as json_file:
        json_load = json.load(json_file)
    # Strip the {"0": {...}} wrapper produced by to_json(orient="index").
    nothing = json.dumps(json_load).replace("]", "").replace("[", "") \
        .replace('{"0":', '').replace('}}', '}')
    result_json = json.loads(nothing)
    result_json.update({"InvoiceItems": invoice_Item})
    result_json.update({"Document": doc_list})
    # The POST to the Bizgaze createsalesinvoice endpoint was already
    # disabled in the original; the parsed JSON is returned to the caller.
    return result_json


@app.route("/Download_invoice")
def Download_invoice():
    # TODO(review): not implemented — returning None makes Flask raise a
    # "view function did not return a response" error; confirm intent.
    pass


@app.route("/Table")
def Table():
    # TODO(review): not implemented — same caveat as Download_invoice.
    pass


if __name__ == "__main__":
    # Listen on all interfaces, port 1112.
    app.run(host='0.0.0.0', port=1112)