From 05cefc1ac2afbf0a11b770e8bc7f9e513653a7e0 Mon Sep 17 00:00:00 2001 From: SadhulaSaiKumar Date: Thu, 11 Jan 2024 04:41:41 +0000 Subject: [PATCH] Update 'Business_cards/Business_cards.py' --- Business_cards/Business_cards.py | 1155 ++++-------------------------- 1 file changed, 122 insertions(+), 1033 deletions(-) diff --git a/Business_cards/Business_cards.py b/Business_cards/Business_cards.py index 24a6185..70c3c19 100644 --- a/Business_cards/Business_cards.py +++ b/Business_cards/Business_cards.py @@ -15,11 +15,6 @@ from functools import partial from urlextract import URLExtract import pytesseract as tess from PIL import Image -# from doctr.io import DocumentFile -# from doctr.models import ocr_predictor -# model = ocr_predictor(pretrained=True) -# load tagger -###################################################### import os import glob @@ -29,20 +24,16 @@ import cv2 import matplotlib from werkzeug.utils import secure_filename import requests -import spacy +#import spacy import time import multiprocessing from PIL import Image from functools import partial -# nlp_model = spacy.load("D:/projects/C01app/Resume_parser/ME") -# nlp_model1 = spacy.load("D:/projects/C01app/Resume_parser/bdeeducation_50_0.2") -from flask import Flask, render_template, request, redirect, Response, send_file import pandas as pd - ################################################################ -Current_Working_Directory = os.getcwd() -Current_Working_Directory = Current_Working_Directory.replace("\\", "/") +Current_Working_Directory=os.getcwd() +Current_Working_Directory=Current_Working_Directory.replace("\\","/") # nlp_model1 = spacy.load(Current_Working_Directory + "/Invoice_parser/p") ################################################################ @@ -58,9 +49,9 @@ model = AutoModelForTokenClassification.from_pretrained("ml6team/bert-base-uncas from paddleocr import PaddleOCR, draw_ocr -ocr = PaddleOCR(use_angle_cls=True, lang='en', use_space_char=True, show_log=False) +ocr = PaddleOCR(use_angle_cls=True, lang='en', use_space_char=True, show_log=True) tagger = SequenceTagger.load("flair/ner-english-large") -# tagger.to("cuda") + import datetime app = Flask(__name__) @@ -68,23 +59,7 @@ app = Flask(__name__) # app.config["IMAGE_UPLOADS"] = "C:/inetpub/wwwroot/FlaskApplication/Flask_Demo/upload/" - @app.route('/', methods=['GET']) -def home(): - return render_template('home.html') - - -@app.route('/resume', methods=['GET']) -def resume(): - return render_template('resume.html') - - -@app.route('/invoice', methods=['GET']) -def invoice(): - return render_template('invoice.html') - - -@app.route('/card', methods=['GET']) def card(): return render_template('card.html') @@ -94,11 +69,13 @@ def card(): def multiplecards(): # print('################## multiple card detection #######################') # print(Dataset) - datalist = [] - zlist = [] + from pathlib import Path + Path("multicards").mkdir(exist_ok=True) + datalist=[] + zlist=[] Dataset = request.get_json() # print(data) - # datalist.append(Dataset) + #datalist.append(Dataset) data = {'visiting': Dataset} for i in data['visiting']: import time @@ -186,7 +163,7 @@ def multiplecards(): import pytesseract as tess from PIL import Image - tess.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe" + tess.pytesseract.tesseract_cmd = r"C:\\Program Files (x86)\\Tesseract-OCR\\tesseract.exe" pdf = tess.image_to_pdf_or_hocr(fname, extension="pdf") with open("demo.pdf", "w+b", ) as f: f.write(pdf) @@ -431,18 +408,18 @@ def multiplecards(): verticaltext = x htext = x # print('------------------------------------------------') - # print('############################################################# this is verticaltext #################################################################') - print(verticaltext) + #print('############################################################# this is verticaltext #################################################################') + # print(verticaltext) htext = htext.replace('\n', ' ') - # print('############################################################# this is htext #############################################################') - # print(htext) + # print('############################################################# this is htext #############################################################') + #print(htext) y = x.replace('\n', ',') y = y.replace(' ', ' ') # y = y.replace(".", " .") horizontaltext = y # print('------------------------------------------------') - # print('############################################################# this is horizontaltext #############################################################') - # print(horizontaltext) + #print('############################################################# this is horizontaltext #############################################################') + #print(horizontaltext) textfile = open("test123456.txt", "w") a = textfile.write(verticaltext) @@ -479,7 +456,7 @@ def multiplecards(): address2 = address2.replace("'", "").replace("(", "").replace(")", "").replace(', ,', '').replace(' ', '') - matches = re.findall(r'-\d{6}\b|\b\d{6}\b|\b\d{3} \d{3}\b|\b-\d{2}\b|-\d{3} \d{3}|\b\d{3} \d{3}\b', a) + matches = re.findall(r'-\d{6}|\b\d{6}\b|\b\d{3} \d{3}\b|\b-\d{2}\b|-\d{3} \d{3}|\b\d{3} \d{3}\b', a) for address3 in matches: pass try: @@ -488,9 +465,9 @@ def multiplecards(): addrespinlst.append(Address) except NameError: - final.append('ADDRESS--') - # print('############################################################ Addressmodelworking #############################################################') + print( + '############################################################ Addressmodelworking #############################################################') # doc = nlp_model1(textaddress) # addlist = [] @@ -755,177 +732,28 @@ def multiplecards(): final.append("OrganizationName--" + s2) except IndexError: - company() - # org_name() - # organisation() + org_name() + organisation() # final.append("OrganizationName--") - ################################################### Email###################################################### - import re - from email_scraper import scrape_emails - s = list(scrape_emails(horizontaltext)) - email_id1 = s - import re - email_id=[] -# Define a function to extract email addresses from a text - def extract_emails(text): - email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,7}\b' - return re.findall(email_pattern, text) - -# List of text strings - - -# Iterate through the list and extract email addresses from each value - for text in email_id1 : - email_addresses = extract_emails(text) - - # Print the extracted email addresses - if email_addresses: - # print("Email addresses in the text:") - for email in email_addresses: - #print(email) - email_id.append(email) - - else: - print("No email addresses found in the text.") - - # Remove "email" if it exists within square brackets - email_id = [item.replace("email", "").replace("Email", "").replace("E-mail", "") for item in email_id] - - # ************************************* CONTACT PERSON ******************************************************************* try: - my_string='Hello' - print(my_string[-6]) - # final.append( - # "CONTACTPERSONNAME--" + PErsons[0].replace(":PER", "").replace("[", "").replace('"', '').replace("]", - # "") + - # PErsons[ - # 1].replace(":PER", "").replace('"', ''))+PErsons[2].replace(":PER", "").replace("[", "").replace('"', '').replace("]","") - + final.append( + "CONTACTPERSONNAME--" + PErsons[0].replace(":PER", "").replace("[", "").replace('"', '').replace("]", + "") + + PErsons[ + 1].replace(":PER", "").replace('"', '')) except IndexError: try: final.append( "CONTACTPERSONNAME--" + PErsons[0].replace(":PER", "").replace("[", "").replace("]", "").replace( - '"', '')) - person_name=PErsons[0].replace(":PER", "").replace("[", "").replace("]", "").replace('"', '').replace(' ','') - if not email_id: - final.append('ContactEmail--') - final.append('OrganizationEmail--') - else: - per_Name=[] - per_Name.append(person_name) - - print(email_id) - - def calculate_matching_percentage(word_list, words): - def calculate_single_matching_percentage(word, item): - max_length = max(len(word), len(item)) - word = word.upper() - item = item.strip().replace(" ", "").upper() - matching_chars = sum(1 for c1, c2 in zip(item, word) if c1 == c2) - return (matching_chars / max_length) * 100 - - highest_percentage = 0.0 - highest_matching_item = None - - for word in words: - word = word.upper() - for item in word_list: - original_item = item - item = item.strip().replace(" ", "").upper() - - matching_percentage = calculate_single_matching_percentage(word, item) - - if matching_percentage > highest_percentage: - highest_percentage = matching_percentage - highest_matching_item = original_item - - return highest_matching_item, highest_percentage - - word_list = email_id - per_Name = [item.split('.')[1] if '.' in item else item for item in per_Name] - print(per_Name) - - word2 = per_Name - - for word in word2: - highest_matching_item, highest_percentage = calculate_matching_percentage(word_list, [word]) - if highest_matching_item is not None: - print( - f"For '{word}', the highest matching percentage is {highest_percentage:.2f}% with '{highest_matching_item}'") - else: - print(f"For '{word}', no matches found.") - #final.append('OrganistaionEmail--' + email_id[0]) - - if len(word_list) == 1: - - if highest_percentage >= 15: - print(highest_matching_item) - final.append( - 'ContactEmail--' + str(highest_matching_item).replace("[", "").replace("]", "").replace( - "\\n", "").replace("'", "")) - final.append('OrganizationEmail--') - - else: - print('not matched') - final.append('OrganistaionEmail--' + email_id[0]) - final.append('ContactEmail--') - - - else: - print('it as more elemnt') - if highest_percentage >= 15: - print(highest_matching_item) - final.append('ContactEmail--' + str(highest_matching_item).replace("[", "").replace("]", "").replace("\\n", "").replace("'", "")) - - - # Given list of email addresses - email_list = word_list - - # Email address to remove - email_to_remove = highest_matching_item - - # Check if the email address is in the list before removing it - if email_to_remove in email_list: - email_list.remove(email_to_remove) - print(f"'{email_to_remove}' has been removed from the list.") - else: - print(f"'{email_to_remove}' is not in the list.") - - # Print the updated list - print("Updated email list:", email_list) - final.append('OrganistaionEmail--' + str(email_list[0]).replace("[", "").replace("]", "").replace("\\n","").replace("'", "")) - else: - final.append('OrganistaionEmail--' + str(email_id[0]) +','+ str(email_id[1])) - - + '"', + '')) except IndexError: - # org_name() - # contactpersonname() - final.append("CONTACTPERSONNAME--") - - - if len(email_id) > 1: - final.append( - 'OrganizationEmail--' + str(email_id[0]).replace("[", "").replace("]", "").replace("\\n", "").replace("'", - "")) - final.append( - 'ContactEmail--' + str(email_id[-1]).replace("[", "").replace("]", "").replace("\\n", "").replace( - "'", - "")) - else: - try: - final.append( - 'ContactEmail--' + str(email_id[0]).replace("[", "").replace("]", "").replace("\\n", "").replace( - "'", - "")) - final.append('OrganizationEmail--') - except IndexError: - final.append('ContactEmail--') - final.append('OrganizationEmail--') - + org_name() + contactpersonname() + # final.append("CONTACTPERSONNAME--") ###############address flair##################### try: @@ -1022,39 +850,12 @@ def multiplecards(): # print(verticaltext) numbers = phonenumbers.PhoneNumberMatcher( - verticaltext.replace('+91', '').replace('(0)', '').replace('(', '').replace(')', '').replace('-', - '').replace( - ' ', ''), "IN") + verticaltext.replace('+91', '').replace('(0)', '').replace('(', '').replace(')', ''), "IN") for number in numbers: number = str(number).split(")") num.append(number[1]) # num.append(number[-1]) - - print(num) - import re - - # Input list of strings - # num =[' 7227906777Extn1204634444'] - - # Define a regular expression pattern to split when text is present - pattern = r'[a-zA-Z]+' - - # Function to split a string based on the pattern - def split_string(text): - return re.split(pattern, text) - - # Process each line in the list - split_lines = [split_string(line) for line in num] - - # Flatten the list of lists into a single list - split_lines = [item for sublist in split_lines for item in sublist] - - # Remove any empty strings - num = [item for item in split_lines if item] - - # Print the split lines - print(num) if len(num) == 0: final.append("ContactNumber--") final.append("OrganizationNumber--") @@ -1087,6 +888,42 @@ def multiplecards(): # except IndexError: # pass + ################################################### Email###################################################### + import re + from email_scraper import scrape_emails + s = list(scrape_emails(horizontaltext)) + email_id = s + + # email_id = [] + # matches = re.findall(r'[\w\.-]+@[\w\.-]+', verticaltext) + # for match in matches: + # email_id.append(match) + + # # final.append('Email--' + match) + # email_ = str(email_id).replace("[", "").replace("]", "").replace("'", "") + # # final.append(email_) + + # # final.append('Email--' + email_) + # # remove_list.append(email_) + if len(email_id) > 1: + final.append( + 'ContactEmail--' + str(email_id[0]).replace("[", "").replace("]", "").replace("\\n", "").replace("'", + "")) + final.append( + 'OrganizationEmail--' + str(email_id[-1]).replace("[", "").replace("]", "").replace("\\n", "").replace( + "'", + "")) + else: + try: + final.append( + 'ContactEmail--' + str(email_id[0]).replace("[", "").replace("]", "").replace("\\n", "").replace( + "'", + "")) + final.append('OrganizationEmail--') + except IndexError: + final.append('ContactEmail--') + final.append('OrganizationEmail--') + ###############PINCODE############ pinlst = [] @@ -1122,9 +959,9 @@ def multiplecards(): # print(addrespinlst) import pgeocode - # print(line12) + #print(line12) import re - matche1 = re.findall(r'-\d{6}\b|\b\d{6}\b|\b\d{3} \d{3}\b|-\d{3} \d{3}|\b\d{3} \d{3}\b', line12) + matche1 = re.findall(r'-\d{6}|\b\d{6}\b|\b\d{3} \d{3}\b|-\d{3} \d{3}|\b\d{3} \d{3}\b', line12) for i in matche1: address3 = i.replace(' ', '').replace('-', '') pinlst.append(address3) @@ -1150,7 +987,7 @@ def multiplecards(): final.append(county_name) except (IndexError, NameError): - final.append("PinCode1--" + " ") + final.append("PinCode1--") final.append("country_code--") final.append("LandMark1--") final.append("state_name--") @@ -1169,11 +1006,6 @@ def multiplecards(): df1.to_csv('path123.csv', index=False) df2 = pd.read_csv('path123.csv') print(df2) - if df2['Values'].isnull().all(): - print("Column 'Column2' is empty.") - return 'Invalid image' - else: - pass df2 = df2.T df2.to_csv('path1.csv', index=False, header=False) df1 = pd.read_csv('path1.csv') @@ -1200,7 +1032,7 @@ def multiplecards(): imagename1 = str(imagename).split('.') imagename = str(imagename1[-2]).replace("[", "]") empty.append("FileName--" + imagename) - empty.append("FilePath--" + "") + empty.append("FilePath--"+ "") imageExtension = str(imagename1[-1]).replace("[", "]") empty.append("FileType--" + imageExtension) image.close() @@ -1229,802 +1061,59 @@ def multiplecards(): z.update(y) # the result is a JSON string: # print(json.dumps(z)) - + zlist.append(z) #############################################creating csv##################################### - # print(final) + #print(final) + #print(imagelist) + #final.append('image--' + str(imagelist)) + # import requests + # import json + + # # url = "https://anwi.bizgaze.app/apis/v4/bizgaze/integrations/businesscards/create/list" #dev + # url = "https://qa.bizgaze.com/apis/v4/bizgaze/integrations/businesscards/create/list" #testing + # # url = "https://test.bizgaze.app/apis/v4/bizgaze/integrations/businesscards/create" # test + # # url='http://localhost:3088/apis/v4/bizgaze/integrations/businesscards/create' + # # url = 'https://c01.bizgaze.app/apis/v4/bizgaze/integrations/businesscards/create' # C01 + # payload1 = json.dumps(zlist) + # # print('--------------------------------------------------------------------------') + # #print(payload1) + # headers = { + # #'Authorization': 'stat 1a936137490040c997928f485e3cdd7a', #dev + # # 'Authorization': 'stat 16516391d0074f4c8a15ea16fb49470b',#testing + # # 'Authorization': 'stat 08e55fcfbaa940c8ab8145a074c444d1', + # # 'Authorization': 'stat f7cdb402e01e44e5842878653946168f', # c01 + # #'Authorization': 'Stat c3e11b2fcbfe455b86a1fe6efde02a69',#demo + # 'Authorization':'Stat e5bc6ad08f2c42feb5f98a2a521d00af', + + + # 'Content-Type': 'application/json' + # } + # response = requests.request("POST", url, headers=headers, data=payload1) + # # print("##############################################################") + + # print(payload1) + # #print(zlist) + # # import os + # # if 'BusinessCards Created Successfully' in response.text: + # # print('present') + # # os.remove(found) + # # else: + # # print('not present') + + # df1.to_json('visitingcard.json') + # data = df1.to_json('visiting.json', orient='records') + # print(data) + + #return render_template('index.html') + + + #return response.text + #return z + return zlist - # print(imagelist) - # final.append('image--' + str(imagelist)) - # import requests - # import json - # # url = "https://anwi.bizgaze.app/apis/v4/bizgaze/integrations/businesscards/create/list" #dev - # url = "https://qa.bizgaze.com/apis/v4/bizgaze/integrations/businesscards/create/list" #testing - # # url = "https://test.bizgaze.app/apis/v4/bizgaze/integrations/businesscards/create" # test - # # url='http://localhost:3088/apis/v4/bizgaze/integrations/businesscards/create' - # # url = 'https://c01.bizgaze.app/apis/v4/bizgaze/integrations/businesscards/create' # C01 - # payload1 = json.dumps(zlist) - # # print('--------------------------------------------------------------------------') - # #print(payload1) - # headers = { - # #'Authorization': 'stat 1a936137490040c997928f485e3cdd7a', #dev - # # 'Authorization': 'stat 16516391d0074f4c8a15ea16fb49470b',#testing - # # 'Authorization': 'stat 08e55fcfbaa940c8ab8145a074c444d1', - # # 'Authorization': 'stat f7cdb402e01e44e5842878653946168f', # c01 - # #'Authorization': 'Stat c3e11b2fcbfe455b86a1fe6efde02a69',#demo - # 'Authorization':'Stat e5bc6ad08f2c42feb5f98a2a521d00af', - - # 'Content-Type': 'application/json' - # } - # response = requests.request("POST", url, headers=headers, data=payload1) - # # print("##############################################################") - - # print(payload1) - # #print(zlist) - # # import os - # # if 'BusinessCards Created Successfully' in response.text: - # # print('present') - # # os.remove(found) - # # else: - # # print('not present') - - # df1.to_json('visitingcard.json') - # data = df1.to_json('visiting.json', orient='records') - # print(data) - - # return render_template('index.html') - - # return response.text - - return z - # return zlist - - -# @app.route('/upload_BusinessCards', methods=["POST"]) -# def mainfunction(): -# Dataset = request.get_json() -# if len(Dataset)==1: -# # predict(Dataset) -# return multiplecards(Dataset) -# else: -# # multiplecards(Dataset) -# return multiplecards(Dataset) - - -################################################################################### Resume parser ################################################################################################### - -@app.route("/upload_resume", methods=["POST"]) -def predict_resume(): - Dataset = request.get_json() - # data = {'visiting': Dataset} - # a=url_list[0] - a = Dataset - # a = url_list - # print(a) - x = a['FileData'] - # print(x) - y = a['FileName'] - y = y.replace(' ', '') - y = y.replace('&', '') - y = y.replace('@', '') - z = a['FileType'] - # CreatedBy=a['CreatedBy'] - - name = y + '.' + z - print(name) - - # img_data = x.encode() - - img_data = x.encode() - - import base64 - with open('./Resume_parser/upload_resume/' + name, "wb") as fh: - fh.write(base64.decodebytes(img_data)) - # cmd = "python ./Resume_parser/resume1.0.multiprocessing.py" + " " + str('./Resume_parser/upload_resume/' + name) - # os.system(cmd) - - # f = "./resume_upload" - # f = os.listdir(f) - f = './Resume_parser/upload_resume/' + name - found = './Resume_parser/upload_resume/' + name - print('this from resumepy file') - print(f) - - def docx_to_txt(): - import docx2txt - import glob - text = '' - for file in glob.glob(found): - c = docx2txt.process(file) - c = c.rstrip("\n") - toPrint = c - d = ' '.join(i for i in toPrint.split()) - d = d.rstrip() - text += d - docx_to_txt.text = text - - def doc_to_txt(): - import docx2txt - import glob - text = '' - # for file in glob.glob(found): - c = docx2txt.process(f) - c = c.rstrip("\n") - toPrint = c - d = ' '.join(i for i in toPrint.split()) - d = d.rstrip() - text += d - doc_to_txt.text = text - - def pdf_to_txt(): - import sys - import fitz - fname = found - doc = fitz.open(fname) - text = "" - for page in doc: - text = text + str(page.get_text()) - pdf_to_txt.text = " ".join(text.split('\n')) - - # for file in f: - print('checking for filetype') - if f.endswith('.doc'): - doc_to_txt() - x = doc_to_txt.text - elif f.endswith('.docx'): - docx_to_txt() - x = docx_to_txt.text - elif f.endswith('.pdf'): - pdf_to_txt() - x = pdf_to_txt.text - - doc = nlp_model(x) - k = [] - l = [] - for ent in doc.ents: - # print(f'{ent.label_.upper():{30}}- {ent.text}') - k.append(ent.label_.upper()) - l.append(ent.text) - columns = k - rows = [l] - import pandas as pd - data = pd.DataFrame(rows, columns=columns) - df = data - - data = df.T - - data.to_csv('./Resume_parser/Ad1.csv', index=True) - - data = pd.read_csv('./Resume_parser/Ad1.csv') - # print(data) - data.rename({data.columns[-2]: 'Key'}, axis=1, inplace=True) - data.rename({data.columns[-1]: 'Values'}, axis=1, inplace=True) - data.to_csv('./Resume_parser/Ad1.csv', index=False) - ##################################################################################################### - # ModelName = "text-davinci-003" - # prompt_value = 'find designation in key value pairs from below text?' + "/n" + str(x) - # max_token_value = 300 - - # # usertext= request.get_data() - # # output = usertext.decode() - # # print(output) - # import os - # import openai - - # # print(usertext) - # openai.api_key = "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" - # # userinput='fibonacci series in python' - # import os - # import openai - - # # openai.api_key = os.getenv("OPENAI_API_KEY") - - # response_text = openai.Completion.create( - # model=ModelName, - # prompt=prompt_value, - # temperature=0, - # max_tokens=max_token_value, - # top_p=1, - # frequency_penalty=0, - # presence_penalty=0, - # stop=["\"\"\""] - # ) - # a = response_text['choices'] - # data = a[0]['text'] - # data=data.replace('\n','$@$') - # data=data.replace('$@$$@$','') - # #data=data.replace(':','') - # print(data) - # data=data.replace('Designation','POSITION') - # data=data.split('$@$') - # print(data) - # import pandas as pd - # desgnaition=pd.DataFrame(data) - # desgnaition=desgnaition[0].str.split(':',expand=True) - # desgnaition.columns=['Key','Values'] - # print(desgnaition) - - # data= pd.read_csv('./Resume_parser/Ad1.csv') - - # frames = [data,desgnaition] - - # result = pd.concat(frames,axis=0) - # result.to_csv('./Resume_parser/Ad1.csv', index=False) - - ######################################################################################################## - # df2 = pd.read_csv('./Ad1.csv') - x1 = pd.read_csv('D:/projects/C01app/Resume_parser/AD11.csv') - tp = pd.read_csv('./Resume_parser/Ad1.csv') - # tp = tp.loc[:, ~tp.columns.str.contains('^Unnamed')] - merge = pd.merge(tp, x1, on='Key', how='right') - merge.to_csv('./Resume_parser/AD.csv', index=False) - df2 = pd.read_csv('./Resume_parser/AD.csv') - # print(df2) - df2 = df2.T - - df2.to_csv('./Resume_parser/path.csv', index=False, header=False) - df1 = pd.read_csv('./Resume_parser/path.csv') - df1.to_json('./Resume_parser/firstjson.json', orient="index") - print(df1) - - doc = nlp_model1(x) - k = [] - l = [] - for ent in doc.ents: - # print(f'{ent.label_.upper():{30}}- {ent.text}') - k.append(ent.label_.upper()) - l.append(ent.text) - columns = k - rows = [l] - data = pd.DataFrame(rows, columns=columns) - df = data - data = df.T - - data.to_csv('./Resume_parser/Ad2.csv', index=True) - data = pd.read_csv('./Resume_parser/Ad2.csv') - data.rename({data.columns[-2]: 'Key'}, axis=1, inplace=True) - data.rename({data.columns[-1]: 'Values'}, axis=1, inplace=True) - data.to_csv('./Resume_parser/Ad2.csv', index=False) - import pandas as pd - import json - dflist = [] - x = pd.read_csv('D:/projects/C01app/Resume_parser/PG.csv') - tp = pd.read_csv('./Resume_parser/Ad2.csv') - # tp = tp.loc[:, ~tp.columns.str.contains('^Unnamed')] - merge = pd.merge(x, tp, on='Key', how='left') - import numpy as np - merge = merge.replace(np.nan, '', regex=True) - merge.to_csv('./Resume_parser/PGmerge.csv', index=False) - - dfPG = pd.read_csv('./Resume_parser/PGmerge.csv') - import numpy as np - dfPG = dfPG.replace({np.nan: None}) - x2 = dfPG.iloc[:, -2].tolist() - y2 = dfPG.iloc[:, -1].tolist() - z1 = dict(zip(x2, y2)) - dflist.append(z1) - # u1 = json.dumps(z1) - import pandas as pd - - x = pd.read_csv('D:/projects/C01app/Resume_parser/UG.csv') - tp = pd.read_csv('./Resume_parser/Ad2.csv') - # tp = tp.loc[:, ~tp.columns.str.contains('^Unnamed')] - merge = pd.merge(x, tp, on='Key', how='left') - import numpy as np - merge = merge.replace(np.nan, '', regex=True) - merge.to_csv('./Resume_parser/UGmerge.csv', index=False) - - dfUG = pd.read_csv('./Resume_parser/UGmerge.csv') - import numpy as np - dfUG = dfUG.replace({np.nan: None}) - x2 = dfUG.iloc[:, -2].tolist() - y2 = dfUG.iloc[:, -1].tolist() - z2 = dict(zip(x2, y2)) - dflist.append(z2) - # u2 = json.dumps(z2) - # final = '[' + str(z1) + ',' + str(z2) + ']' - # return render_template('resume.html') - - ############################################################################ - import pandas as pd - - x = pd.read_csv('D:/projects/C01app/Resume_parser/inter.csv') - tp = pd.read_csv('./Resume_parser/Ad2.csv') - # tp = tp.loc[:, ~tp.columns.str.contains('^Unnamed')] - merge = pd.merge(x, tp, on='Key', how='left') - import numpy as np - merge = merge.replace(np.nan, '', regex=True) - merge.to_csv('./Resume_parser/intermerge.csv', index=False) - - dfinter = pd.read_csv('./Resume_parser/intermerge.csv') - import numpy as np - dfinter = dfinter.replace({np.nan: None}) - x2 = dfinter.iloc[:, -2].tolist() - y2 = dfinter.iloc[:, -1].tolist() - z3 = dict(zip(x2, y2)) - dflist.append(z3) - - ############################################################################ - import pandas as pd - - x = pd.read_csv('D:/projects/C01app/Resume_parser/SSC.csv') - tp = pd.read_csv('./Resume_parser/Ad2.csv') - # tp = tp.loc[:, ~tp.columns.str.contains('^Unnamed')] - merge = pd.merge(x, tp, on='Key', how='left') - import numpy as np - merge = merge.replace(np.nan, '', regex=True) - merge.to_csv('./Resume_parser/sscmerge.csv', index=False) - - dfssc = pd.read_csv('./Resume_parser/sscmerge.csv') - import numpy as np - dfssc = dfssc.replace({np.nan: None}) - x2 = dfssc.iloc[:, -2].tolist() - y2 = dfssc.iloc[:, -1].tolist() - z4 = dict(zip(x2, y2)) - dflist.append(z4) - ############################################Document############################################################ - import base64 - empty = [] - name = f - image = open(name, 'rb') - image_read = image.read() - image_64_encode = base64.b64encode(image_read) - NULL = 'null' - # empty.append("ByteData--" + (NULL).strip('""')) - image_64_encode = image_64_encode.decode('utf-8') - empty.append("FileData--" + str(image_64_encode)) - imagedata = name.split("/") - imagename = str(imagedata[-1]).replace('"', '').replace("[", "").replace("]", "") - imagename1 = str(imagename).split('.') - - imagename = str(imagename1[-2]).replace("[", "]") - empty.append("FileName--" + imagename) - empty.append("FilePath--" + "") - imageExtension = str(imagename1[-1]).replace("[", "]") - empty.append("FileType--" + imageExtension) - - import pandas as pd - df = pd.DataFrame(empty) - df = df[0].str.split("--", expand=True) - data1 = pd.DataFrame(df[0]) - data2 = pd.DataFrame(df[1]) - dt = data2.set_index(data1[0]) - - dt4 = dt.T - list = [] - dictionary = dt4.to_dict(orient="index") - - a = { - "FileId": 0, - "FileData": "", - "FileName": "", - "FileType": "", - "RefId": 0 - } - list = [] - - list.append(a) - list.append(dictionary[1]) - - import json - - with open('./Resume_parser/firstjson.json', 'r') as json_file: - json_load = json.load(json_file) - - # url = "https://test.bizgaze.app:8443/apis/v4/bizgaze/integrations/businesscards/create" - - nothing = json.dumps(json_load).replace("]", "").replace("[", "").replace('{"0":', '').replace('}}', '}') - import json - - # JSON data: - x = nothing - - # python object to be appended - y = {"EducationDetails": dflist} - y1 = {"Document": list} - print(y) - # parsing JSON string: - z = json.loads(x) - - # appending the data - z.update(y) - z.update(y1) - - # the result is a JSON string: - # print(json.dumps(z)) - print('##########################') - # print(z) - print('##########################') - import requests - import json - - # with open('visitingcard1.json', 'r') as json_file: - # json_load = json.load(json_file) - # url = "https://qa.bizgaze.app/apis/v4/bizgaze/integrations/resumeparsing/save" #dev - # # url = "https://test.bizgaze.app:8443/apis/v4/bizgaze/integrations/resumeparsing/save" - # #url = "https://qa.bizgaze.app/apis/v4/bizgaze/integrations/resumeparsing/save" #testing - # payload1 = json.dumps(z) - # print('--------------------------------------------------------------------------') - # # print(payload1) - # headers = { - # # 'Authorization': 'stat 53f27e671adf456e974f1d11ceb5db41', - # #'Authorization': 'stat 5702ce5a77d34e0381bc2f06588d9bcc',#dev - # 'Authorization': 'stat ed5dd14ee2094227849f6bbe2928bff3', #testing - # 'Content-Type': 'application/json' - # } - # response = requests.request("POST", url, headers=headers, data=payload1) - # print("##############################################################") - - # print(response.text) - # function_1.var=response - # a=str(response.text) - - files = glob.glob('./resume_upload/*') - for f in files: - os.remove(f) - - return z - # return 'done' - - -# return render_template('resume.html') - - -# @app.route('/upload_resume', methods=["POST"]) -def upload_resume(): - if __name__ == "__main__": - # print(os.getpid()) - - url_list = [] - Dataset = request.get_json() - # id = "100013660000125" - url_list.append(Dataset) - # multiprocessing - with multiprocessing.Pool(processes=1) as pool: - results = pool.map(predict_resume, url_list) - - pool.close() - return results[0] - - -@app.route("/Download_resume") -def Download_resume(): - # try: - with open("Ad1.csv", encoding="unicode_escape") as fp: - csv = fp.read() - return Response(csv, mimetype="text/csv", headers={"Content-disposition": "attachment; filename=Resume.csv"}) - - -############################################################################## Invoice Parser ################################################################################################### - -@app.route('/upload_invoice', methods=["POST", "GET"]) -def upload_invoice(): - Dataset = request.get_json() - # data = {'visiting': Dataset} - # a=url_list[0] - a = Dataset - - x = a['FileData'] - # print(x) - y = a['FileName'] - z = a['FileType'] - # CreatedBy=a['CreatedBy'] - - name = y + '.' + z - print(name) - - img_data = x.encode() - - import base64 - with open('./Invoice_parser/upload_invoice/' + name, "wb") as fh: - fh.write(base64.decodebytes(img_data)) - - # cmd = "python ./Invoice_parser/invoice.multiprocessing.py" + " " + str('./Invoice_parser/upload_invoice/' + name) - # os.system(cmd) - ##################################################################################################################################### - - name = './Invoice_parser/upload_invoice/' + name - extension = name.split('.')[-1] - - def image_to_text(): - print('####################### image-to-pdf ################') - - import cv2 - import numpy as np - fname = name - print(fname) - import pytesseract as tess - from PIL import Image - - tess.pytesseract.tesseract_cmd = r"C:\Program Files (x86)\Tesseract-OCR\tesseract.exe" - img = cv2.imread(fname) - # img = cv2.resize(img, None, fx=1.2, fy=1.2, interpolation=cv2.INTER_CUBIC) - - # img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) - - # kernel = np.ones((1, 1), np.uint8) - # img = cv2.dilate(img, kernel, iterations=1) - # img = cv2.erode(img, kernel, iterations=1) - - # img=cv2.threshold(cv2.GaussianBlur(img, (5, 5), 0), 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1] - pdf = tess.image_to_pdf_or_hocr(img, extension="pdf") - with open(Current_Working_Directory + "/Invoice_parser/demo.pdf", "w+b", ) as f: - f.write(pdf) - print('demo created') - import fitz - fname = Current_Working_Directory + '/Invoice_parser/demo.pdf' - doc = fitz.open(fname) - text = "" - for page in doc: - text = text + str(page.get_text()) - image_to_text.text = " ".join(text.split("\n")) - - # result = ocr.ocr( Current_Working_Directory + "/Invoice_parser/demo.pdf" , cls=True) - # result = result[0] - - # txts = [line[1][0] for line in result] - - # image_to_text.text = "" - # for i in txts: - # if len(i) < 4: - # continue - # # print(i+"\n") - # image_to_text.text = image_to_text.text + str(i) + "\n" - - def pdf_to_text(): - import fitz - fname = name - doc = fitz.open(fname) - text = "" - for page in doc: - text = text + str(page.get_text()) - pdf_to_text.text = " ".join(text.split("\n")) - - extensionlist = ['JPEG', 'jpg', 'png', 'JPG', 'PNG', 'jpeg'] - - if extension in extensionlist: - print('image' + extension) - image_to_text() - x = image_to_text.text - - else: - print('pdf' + extension) - pdf_to_text() - x = pdf_to_text.text - - import spacy - import sys - # import fitz - # fname = "uploads/0.pdf" - # doc = fitz.open(fname) - # text = "" - # for page in doc: - # text = text + str(page.get_text()) - # fitz = " ".join(text.split("\n")) - # # print(fitz) - import pandas as pd - - doc = nlp_model1(x) - k = [] - l = [] - for ent in doc.ents: - # print(f"{ent.label_.upper():{30}}- {ent.text}") - k.append(ent.label_.upper()) - l.append(ent.text) - columns = k - rows = [l] - data = pd.DataFrame(rows, columns=columns) - df = data - df = data.T - - df.to_csv(Current_Working_Directory + "/Invoice_parser/Invoice.csv") - import pandas as pd - df = pd.read_csv(Current_Working_Directory + "/Invoice_parser/Invoice.csv") - # df.head() - # df = df.T - # new_header = df.iloc[0] # grab the first row for the header - # df = df[1:] # take the data less the header row - # df.columns = new_header - # def df_column_uniquify(df): - # df_columns = df.columns - # new_columns = [] - # for item in df_columns: - # counter = 0 - # newitem = item - # while newitem in new_columns: - # counter += 1 - # newitem = "{}_{}".format(item, counter) - # new_columns.append(newitem) - # df.columns = new_columns - # return df.T - # df = df_column_uniquify(df) - # # df=df.T - # df.to_csv('final.csv') - # df = pd.read_csv('final.csv') - df.rename({df.columns[-2]: 'Key'}, axis=1, inplace=True) - df.rename({df.columns[-1]: 'Values'}, axis=1, inplace=True) - df['Key'] = df['Key'].str.replace('/', '') - df['Key'] = df['Key'].str.replace(' ', '') - df.to_csv(Current_Working_Directory + '/Invoice_parser/final.csv', index=False) - import pandas as pd - x1 = pd.read_csv(Current_Working_Directory + '/Invoice_parser/final.csv') - tp = pd.read_csv(Current_Working_Directory + '/Invoice_parser/finalwithcolen.csv') - merge = pd.merge(x1, tp, on='Key', how='right') - merge1 = merge - - merge['Values'] = merge['Values'].astype(str) - merge = merge['Values'].str.split(":", expand=True) - merge.rename({merge.columns[-1]: 'Values'}, axis=1, inplace=True) - frames = [merge1['Key'], merge['Values']] - result = pd.concat(frames, axis=1) - x1 = pd.read_csv(Current_Working_Directory + '/Invoice_parser/final.csv') - tp = pd.read_csv(Current_Working_Directory + '/Invoice_parser/finalwithoutcolen.csv') - merged = pd.merge(x1, tp, on='Key', how='right') - frames = [result, merged] - result1 = pd.concat(frames) - result1.to_csv(Current_Working_Directory + '/Invoice_parser/final1.csv', index=False) - - x1 = pd.read_csv(Current_Working_Directory + '/Invoice_parser/main.csv') - tp = pd.read_csv(Current_Working_Directory + '/Invoice_parser/final1.csv') - # tp = pd.read_csv(Current_Working_Directory + 'Invoice_parser/final.csv') - tp['Key'] = tp['Key'].astype(str) - tp['Values'] = tp['Values'].astype(str) - tp['Key'] = tp['Key'].str.strip() - tp['Values'] = tp['Values'].str.strip() - - merge = pd.merge(tp, x1, on='Key', how='right') - merge.to_csv(Current_Working_Directory + '/Invoice_parser/invoicewithouttable.csv', index=False) - df2 = pd.read_csv(Current_Working_Directory + '/Invoice_parser/invoicewithouttable.csv') - - # Import writer class from csv module - from csv import writer - - List = ['PlantCode', " "] - with open(Current_Working_Directory + '/Invoice_parser/invoicewithouttable.csv', 'a') as f_object: - writer_object = writer(f_object) - writer_object.writerow(List) - f_object.close() - # print(df2) - df2 = pd.read_csv(Current_Working_Directory + '/Invoice_parser/invoicewithouttable.csv') - print(df2) - df2 = df2.T - - df2.to_csv(Current_Working_Directory + '/Invoice_parser/invoicewithouttable.csv', index=False, header=False) - - df1 = pd.read_csv(Current_Working_Directory + '/Invoice_parser/invoicewithouttable.csv') - df1.to_json(Current_Working_Directory + '/Invoice_parser/firstjson.json', orient="index") - import pandas as pd - x = pd.read_csv(Current_Working_Directory + '/Invoice_parser/final.csv') - tp = pd.read_csv(Current_Working_Directory + '/Invoice_parser/item1.csv') - x['Values'] = x['Values'].str.strip() - merge = pd.merge(tp, x, on='Key', how='inner') - merge = merge.groupby('Key').agg({ - 'Values': '/'.join, - }).reset_index() - z = merge['Values'].str.split('/', expand=True) - frames = [merge, z] - result1 = pd.concat(frames, axis=1) - result1 = result1.drop(['Values'], axis=1) - import pandas as pd - tp = pd.read_csv(Current_Working_Directory + '/Invoice_parser/item1.csv') - merge = pd.merge(tp, result1, on='Key', how='inner') - merge = merge.T - new_header = merge.iloc[0] # grab the first row for the header - merge = merge[1:] # take the data less the header row - merge.columns = new_header - - merge = merge.to_dict('records') - invoice_Item = merge - print(invoice_Item) - - ####################################Document############################################################ - - import base64 - empty = [] - # name = found - image = open(name, 'rb') - image_read = image.read() - image_64_encode = base64.b64encode(image_read) - NULL = 'null' - # empty.append("ByteData--" + (NULL).strip('""')) - image_64_encode = image_64_encode.decode('utf-8') - empty.append("FileData--" + str(image_64_encode)) - imagedata = name.split("/") - imagename = str(imagedata[-1]).replace('"', '').replace("[", "").replace("]", "") - imagename1 = str(imagename).split('.') - imagename = str(imagename1[-2]).replace("[", "]") - empty.append("FileName--" + imagename) - empty.append("FilePath--" + name) - imageExtension = str(imagename1[-1]).replace("[", "]") - empty.append("FileType--" + imageExtension) - import pandas as pd - df = pd.DataFrame(empty) - df = df[0].str.split("--", expand=True) - data1 = pd.DataFrame(df[0]) - data2 = pd.DataFrame(df[1]) - dt = data2.set_index(data1[0]) - dt4 = dt.T - list = [] - dictionary = dt4.to_dict(orient="index") - - a = { - "FileId": 0, - "FileData": "", - "FileName": "", - "FileType": "", - "RefId": 0 - } - list = [] - list.append(a) - list.append(dictionary[1]) - import json - with open(Current_Working_Directory + '/Invoice_parser/firstjson.json', 'r') as json_file: - json_load = json.load(json_file) - # url = "https://test.bizgaze.app:8443/apis/v4/bizgaze/integrations/businesscards/create" - nothing = json.dumps(json_load).replace("]", "").replace("[", "").replace('{"0":', '').replace('}}', '}') - import json - - # JSON data: - x = nothing - # python object to be appended - y = {"InvoiceItems": invoice_Item} - y1 = {"Document": list} - # parsing JSON string: - z = json.loads(x) - # appending the data - z.update(y) - z.update(y1) - # print(z) - # the result is a JSON string: - # print(json.dumps(z)) - # print('##########################') - # print(z) - # print('##########################') - # import requests - # import json - # # with open('visitingcard1.json', 'r') as json_file: - # # json_load = json.load(json_file) - # url = "https://test.bizgaze.app:8443/apis/v4/bizgaze/integrations/invoice/createsalesinvoice" - # #url="https://test.bizgaze.app:8443/apis/v4/bizgaze/integrations/invoice/createsalesinvoice" - # payload1 = json.dumps(z) - # print('--------------------------------------------------------------------------') - # print(payload1) - # headers = { - # 'Authorization': 'stat 089166c35d4c4d7d941c99d6f8986834', - # 'Content-Type': 'application/json' - # } - # response = requests.request("POST", url, headers=headers, data=payload1) - # print("##############################################################") - # print(response.text) - # import glob - # files = glob.glob( - # "upload_invoice/*" - # ) - # for f in files: - # os.remove(f) - # files = glob.glob( - # "uploads/*" - # ) - # for f in files: - # os.remove(f) - - return z - - # return render_template('invoice.html') - - -@app.route("/Download_invoice") -def Download_invoice(): - pass - - -@app.route("/Table") -def Table(): - pass if __name__ == "__main__": - app.run(host='0.0.0.0', port=1112) - - + app.run(host='0.0.0.0', port=1112) \ No newline at end of file