diff --git a/Business_cards/Business_cards.py b/Business_cards/Business_cards.py index d94ffd2..24a6185 100644 --- a/Business_cards/Business_cards.py +++ b/Business_cards/Business_cards.py @@ -34,15 +34,16 @@ import time import multiprocessing from PIL import Image from functools import partial -nlp_model = spacy.load("D:/projects/C01app/Resume_parser/ME") -nlp_model1 = spacy.load("D:/projects/C01app/Resume_parser/bdeeducation_50_0.2") +# nlp_model = spacy.load("D:/projects/C01app/Resume_parser/ME") +# nlp_model1 = spacy.load("D:/projects/C01app/Resume_parser/bdeeducation_50_0.2") from flask import Flask, render_template, request, redirect, Response, send_file import pandas as pd + ################################################################ -Current_Working_Directory=os.getcwd() -Current_Working_Directory=Current_Working_Directory.replace("\\","/") -nlp_model1 = spacy.load(Current_Working_Directory + "/Invoice_parser/p") +Current_Working_Directory = os.getcwd() +Current_Working_Directory = Current_Working_Directory.replace("\\", "/") +# nlp_model1 = spacy.load(Current_Working_Directory + "/Invoice_parser/p") ################################################################ # import spacy @@ -59,7 +60,7 @@ from paddleocr import PaddleOCR, draw_ocr ocr = PaddleOCR(use_angle_cls=True, lang='en', use_space_char=True, show_log=False) tagger = SequenceTagger.load("flair/ner-english-large") - +# tagger.to("cuda") import datetime app = Flask(__name__) @@ -93,11 +94,11 @@ def card(): def multiplecards(): # print('################## multiple card detection #######################') # print(Dataset) - datalist=[] - zlist=[] + datalist = [] + zlist = [] Dataset = request.get_json() # print(data) - #datalist.append(Dataset) + # datalist.append(Dataset) data = {'visiting': Dataset} for i in data['visiting']: import time @@ -185,7 +186,7 @@ def multiplecards(): import pytesseract as tess from PIL import Image - tess.pytesseract.tesseract_cmd = r"C:\\Program Files (x86)\\Tesseract-OCR\\tesseract.exe" + tess.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe" pdf = tess.image_to_pdf_or_hocr(fname, extension="pdf") with open("demo.pdf", "w+b", ) as f: f.write(pdf) @@ -430,18 +431,18 @@ def multiplecards(): verticaltext = x htext = x # print('------------------------------------------------') - #print('############################################################# this is verticaltext #################################################################') + # print('############################################################# this is verticaltext #################################################################') print(verticaltext) htext = htext.replace('\n', ' ') - # print('############################################################# this is htext #############################################################') - #print(htext) + # print('############################################################# this is htext #############################################################') + # print(htext) y = x.replace('\n', ',') y = y.replace(' ', ' ') # y = y.replace(".", " .") horizontaltext = y # print('------------------------------------------------') - #print('############################################################# this is horizontaltext #############################################################') - #print(horizontaltext) + # print('############################################################# this is horizontaltext #############################################################') + # print(horizontaltext) textfile = open("test123456.txt", "w") a = textfile.write(verticaltext) @@ -478,7 +479,7 @@ def multiplecards(): address2 = address2.replace("'", "").replace("(", "").replace(")", "").replace(', ,', '').replace(' ', '') - matches = re.findall(r'-\d{6}|\b\d{6}\b|\b\d{3} \d{3}\b|\b-\d{2}\b|-\d{3} \d{3}|\b\d{3} \d{3}\b', a) + matches = re.findall(r'-\d{6}\b|\b\d{6}\b|\b\d{3} \d{3}\b|\b-\d{2}\b|-\d{3} \d{3}|\b\d{3} \d{3}\b', a) for address3 in matches: pass try: @@ -489,8 +490,7 @@ def multiplecards(): except NameError: final.append('ADDRESS--') - - #print('############################################################ Addressmodelworking #############################################################') + # print('############################################################ Addressmodelworking #############################################################') # doc = nlp_model1(textaddress) # addlist = [] @@ -755,28 +755,177 @@ def multiplecards(): final.append("OrganizationName--" + s2) except IndexError: - org_name() - organisation() + company() + # org_name() + # organisation() # final.append("OrganizationName--") + ################################################### Email###################################################### + import re + from email_scraper import scrape_emails + s = list(scrape_emails(horizontaltext)) + email_id1 = s + import re + email_id=[] +# Define a function to extract email addresses from a text + def extract_emails(text): + email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,7}\b' + return re.findall(email_pattern, text) + +# List of text strings + + +# Iterate through the list and extract email addresses from each value + for text in email_id1 : + email_addresses = extract_emails(text) + + # Print the extracted email addresses + if email_addresses: + # print("Email addresses in the text:") + for email in email_addresses: + #print(email) + email_id.append(email) + + else: + print("No email addresses found in the text.") + + # Remove "email" if it exists within square brackets + email_id = [item.replace("email", "").replace("Email", "").replace("E-mail", "") for item in email_id] + + # ************************************* CONTACT PERSON ******************************************************************* try: - final.append( - "CONTACTPERSONNAME--" + PErsons[0].replace(":PER", "").replace("[", "").replace('"', '').replace("]", - "") + - PErsons[ - 1].replace(":PER", "").replace('"', '')) + my_string='Hello' + print(my_string[-6]) + # final.append( + # "CONTACTPERSONNAME--" + PErsons[0].replace(":PER", "").replace("[", "").replace('"', '').replace("]", + # "") + + # PErsons[ + # 1].replace(":PER", "").replace('"', ''))+PErsons[2].replace(":PER", "").replace("[", "").replace('"', '').replace("]","") + except IndexError: try: final.append( "CONTACTPERSONNAME--" + PErsons[0].replace(":PER", "").replace("[", "").replace("]", "").replace( - '"', - '')) + '"', '')) + person_name=PErsons[0].replace(":PER", "").replace("[", "").replace("]", "").replace('"', '').replace(' ','') + if not email_id: + final.append('ContactEmail--') + final.append('OrganizationEmail--') + else: + per_Name=[] + per_Name.append(person_name) + + print(email_id) + + def calculate_matching_percentage(word_list, words): + def calculate_single_matching_percentage(word, item): + max_length = max(len(word), len(item)) + word = word.upper() + item = item.strip().replace(" ", "").upper() + matching_chars = sum(1 for c1, c2 in zip(item, word) if c1 == c2) + return (matching_chars / max_length) * 100 + + highest_percentage = 0.0 + highest_matching_item = None + + for word in words: + word = word.upper() + for item in word_list: + original_item = item + item = item.strip().replace(" ", "").upper() + + matching_percentage = calculate_single_matching_percentage(word, item) + + if matching_percentage > highest_percentage: + highest_percentage = matching_percentage + highest_matching_item = original_item + + return highest_matching_item, highest_percentage + + word_list = email_id + per_Name = [item.split('.')[1] if '.' in item else item for item in per_Name] + print(per_Name) + + word2 = per_Name + + for word in word2: + highest_matching_item, highest_percentage = calculate_matching_percentage(word_list, [word]) + if highest_matching_item is not None: + print( + f"For '{word}', the highest matching percentage is {highest_percentage:.2f}% with '{highest_matching_item}'") + else: + print(f"For '{word}', no matches found.") + #final.append('OrganistaionEmail--' + email_id[0]) + + if len(word_list) == 1: + + if highest_percentage >= 15: + print(highest_matching_item) + final.append( + 'ContactEmail--' + str(highest_matching_item).replace("[", "").replace("]", "").replace( + "\\n", "").replace("'", "")) + final.append('OrganizationEmail--') + + else: + print('not matched') + final.append('OrganistaionEmail--' + email_id[0]) + final.append('ContactEmail--') + + + else: + print('it as more elemnt') + if highest_percentage >= 15: + print(highest_matching_item) + final.append('ContactEmail--' + str(highest_matching_item).replace("[", "").replace("]", "").replace("\\n", "").replace("'", "")) + + + # Given list of email addresses + email_list = word_list + + # Email address to remove + email_to_remove = highest_matching_item + + # Check if the email address is in the list before removing it + if email_to_remove in email_list: + email_list.remove(email_to_remove) + print(f"'{email_to_remove}' has been removed from the list.") + else: + print(f"'{email_to_remove}' is not in the list.") + + # Print the updated list + print("Updated email list:", email_list) + final.append('OrganistaionEmail--' + str(email_list[0]).replace("[", "").replace("]", "").replace("\\n","").replace("'", "")) + else: + final.append('OrganistaionEmail--' + str(email_id[0]) +','+ str(email_id[1])) + + except IndexError: - org_name() - contactpersonname() - # final.append("CONTACTPERSONNAME--") + # org_name() + # contactpersonname() + final.append("CONTACTPERSONNAME--") + + + if len(email_id) > 1: + final.append( + 'OrganizationEmail--' + str(email_id[0]).replace("[", "").replace("]", "").replace("\\n", "").replace("'", + "")) + final.append( + 'ContactEmail--' + str(email_id[-1]).replace("[", "").replace("]", "").replace("\\n", "").replace( + "'", + "")) + else: + try: + final.append( + 'ContactEmail--' + str(email_id[0]).replace("[", "").replace("]", "").replace("\\n", "").replace( + "'", + "")) + final.append('OrganizationEmail--') + except IndexError: + final.append('ContactEmail--') + final.append('OrganizationEmail--') + ###############address flair##################### try: @@ -873,7 +1022,9 @@ def multiplecards(): # print(verticaltext) numbers = phonenumbers.PhoneNumberMatcher( - verticaltext.replace('+91', '').replace('(0)', '').replace('(', '').replace(')', ''), "IN") + verticaltext.replace('+91', '').replace('(0)', '').replace('(', '').replace(')', '').replace('-', + '').replace( + ' ', ''), "IN") for number in numbers: number = str(number).split(")") @@ -884,8 +1035,7 @@ def multiplecards(): import re # Input list of strings -# num =[' 7227906777Extn1204634444'] - + # num =[' 7227906777Extn1204634444'] # Define a regular expression pattern to split when text is present pattern = r'[a-zA-Z]+' @@ -937,42 +1087,6 @@ def multiplecards(): # except IndexError: # pass - ################################################### Email###################################################### - import re - from email_scraper import scrape_emails - s = list(scrape_emails(horizontaltext)) - email_id = s - - # email_id = [] - # matches = re.findall(r'[\w\.-]+@[\w\.-]+', verticaltext) - # for match in matches: - # email_id.append(match) - - # # final.append('Email--' + match) - # email_ = str(email_id).replace("[", "").replace("]", "").replace("'", "") - # # final.append(email_) - - # # final.append('Email--' + email_) - # # remove_list.append(email_) - if len(email_id) > 1: - final.append( - 'ContactEmail--' + str(email_id[0]).replace("[", "").replace("]", "").replace("\\n", "").replace("'", - "")) - final.append( - 'OrganizationEmail--' + str(email_id[-1]).replace("[", "").replace("]", "").replace("\\n", "").replace( - "'", - "")) - else: - try: - final.append( - 'ContactEmail--' + str(email_id[0]).replace("[", "").replace("]", "").replace("\\n", "").replace( - "'", - "")) - final.append('OrganizationEmail--') - except IndexError: - final.append('ContactEmail--') - final.append('OrganizationEmail--') - ###############PINCODE############ pinlst = [] @@ -1008,9 +1122,9 @@ def multiplecards(): # print(addrespinlst) import pgeocode - #print(line12) + # print(line12) import re - matche1 = re.findall(r'-\d{6}|\b\d{6}\b|\b\d{3} \d{3}\b|-\d{3} \d{3}|\b\d{3} \d{3}\b', line12) + matche1 = re.findall(r'-\d{6}\b|\b\d{6}\b|\b\d{3} \d{3}\b|-\d{3} \d{3}|\b\d{3} \d{3}\b', line12) for i in matche1: address3 = i.replace(' ', '').replace('-', '') pinlst.append(address3) @@ -1036,7 +1150,7 @@ def multiplecards(): final.append(county_name) except (IndexError, NameError): - final.append("PinCode1--"+" ") + final.append("PinCode1--" + " ") final.append("country_code--") final.append("LandMark1--") final.append("state_name--") @@ -1086,7 +1200,7 @@ def multiplecards(): imagename1 = str(imagename).split('.') imagename = str(imagename1[-2]).replace("[", "]") empty.append("FileName--" + imagename) - empty.append("FilePath--"+ "") + empty.append("FilePath--" + "") imageExtension = str(imagename1[-1]).replace("[", "]") empty.append("FileType--" + imageExtension) image.close() @@ -1115,61 +1229,802 @@ def multiplecards(): z.update(y) # the result is a JSON string: # print(json.dumps(z)) - + zlist.append(z) #############################################creating csv##################################### - # print(final) + # print(final) - + # print(imagelist) + # final.append('image--' + str(imagelist)) + # import requests + # import json - - #print(imagelist) - #final.append('image--' + str(imagelist)) - # import requests - # import json + # # url = "https://anwi.bizgaze.app/apis/v4/bizgaze/integrations/businesscards/create/list" #dev + # url = "https://qa.bizgaze.com/apis/v4/bizgaze/integrations/businesscards/create/list" #testing + # # url = "https://test.bizgaze.app/apis/v4/bizgaze/integrations/businesscards/create" # test + # # url='http://localhost:3088/apis/v4/bizgaze/integrations/businesscards/create' + # # url = 'https://c01.bizgaze.app/apis/v4/bizgaze/integrations/businesscards/create' # C01 + # payload1 = json.dumps(zlist) + # # print('--------------------------------------------------------------------------') + # #print(payload1) + # headers = { + # #'Authorization': 'stat 1a936137490040c997928f485e3cdd7a', #dev + # # 'Authorization': 'stat 16516391d0074f4c8a15ea16fb49470b',#testing + # # 'Authorization': 'stat 08e55fcfbaa940c8ab8145a074c444d1', + # # 'Authorization': 'stat f7cdb402e01e44e5842878653946168f', # c01 + # #'Authorization': 'Stat c3e11b2fcbfe455b86a1fe6efde02a69',#demo + # 'Authorization':'Stat e5bc6ad08f2c42feb5f98a2a521d00af', - # # url = "https://anwi.bizgaze.app/apis/v4/bizgaze/integrations/businesscards/create/list" #dev - # url = "https://qa.bizgaze.com/apis/v4/bizgaze/integrations/businesscards/create/list" #testing - # # url = "https://test.bizgaze.app/apis/v4/bizgaze/integrations/businesscards/create" # test - # # url='http://localhost:3088/apis/v4/bizgaze/integrations/businesscards/create' - # # url = 'https://c01.bizgaze.app/apis/v4/bizgaze/integrations/businesscards/create' # C01 - # payload1 = json.dumps(zlist) - # # print('--------------------------------------------------------------------------') - # #print(payload1) - # headers = { - # #'Authorization': 'stat 1a936137490040c997928f485e3cdd7a', #dev - # # 'Authorization': 'stat 16516391d0074f4c8a15ea16fb49470b',#testing - # # 'Authorization': 'stat 08e55fcfbaa940c8ab8145a074c444d1', - # # 'Authorization': 'stat f7cdb402e01e44e5842878653946168f', # c01 - # #'Authorization': 'Stat c3e11b2fcbfe455b86a1fe6efde02a69',#demo - # 'Authorization':'Stat e5bc6ad08f2c42feb5f98a2a521d00af', + # 'Content-Type': 'application/json' + # } + # response = requests.request("POST", url, headers=headers, data=payload1) + # # print("##############################################################") + + # print(payload1) + # #print(zlist) + # # import os + # # if 'BusinessCards Created Successfully' in response.text: + # # print('present') + # # os.remove(found) + # # else: + # # print('not present') + + # df1.to_json('visitingcard.json') + # data = df1.to_json('visiting.json', orient='records') + # print(data) + + # return render_template('index.html') + + # return response.text + + return z + # return zlist - # 'Content-Type': 'application/json' - # } - # response = requests.request("POST", url, headers=headers, data=payload1) - # # print("##############################################################") +# @app.route('/upload_BusinessCards', methods=["POST"]) +# def mainfunction(): +# Dataset = request.get_json() +# if len(Dataset)==1: +# # predict(Dataset) +# return multiplecards(Dataset) +# else: +# # multiplecards(Dataset) +# return multiplecards(Dataset) - # print(payload1) - # #print(zlist) - # # import os - # # if 'BusinessCards Created Successfully' in response.text: - # # print('present') - # # os.remove(found) - # # else: - # # print('not present') - # df1.to_json('visitingcard.json') - # data = df1.to_json('visiting.json', orient='records') - # print(data) +################################################################################### Resume parser ################################################################################################### - #return render_template('index.html') - +@app.route("/upload_resume", methods=["POST"]) +def predict_resume(): + Dataset = request.get_json() + # data = {'visiting': Dataset} + # a=url_list[0] + a = Dataset + # a = url_list + # print(a) + x = a['FileData'] + # print(x) + y = a['FileName'] + y = y.replace(' ', '') + y = y.replace('&', '') + y = y.replace('@', '') + z = a['FileType'] + # CreatedBy=a['CreatedBy'] - #return response.text - #return z - return zlist + name = y + '.' + z + print(name) + + # img_data = x.encode() + + img_data = x.encode() + + import base64 + with open('./Resume_parser/upload_resume/' + name, "wb") as fh: + fh.write(base64.decodebytes(img_data)) + # cmd = "python ./Resume_parser/resume1.0.multiprocessing.py" + " " + str('./Resume_parser/upload_resume/' + name) + # os.system(cmd) + + # f = "./resume_upload" + # f = os.listdir(f) + f = './Resume_parser/upload_resume/' + name + found = './Resume_parser/upload_resume/' + name + print('this from resumepy file') + print(f) + + def docx_to_txt(): + import docx2txt + import glob + text = '' + for file in glob.glob(found): + c = docx2txt.process(file) + c = c.rstrip("\n") + toPrint = c + d = ' '.join(i for i in toPrint.split()) + d = d.rstrip() + text += d + docx_to_txt.text = text + + def doc_to_txt(): + import docx2txt + import glob + text = '' + # for file in glob.glob(found): + c = docx2txt.process(f) + c = c.rstrip("\n") + toPrint = c + d = ' '.join(i for i in toPrint.split()) + d = d.rstrip() + text += d + doc_to_txt.text = text + + def pdf_to_txt(): + import sys + import fitz + fname = found + doc = fitz.open(fname) + text = "" + for page in doc: + text = text + str(page.get_text()) + pdf_to_txt.text = " ".join(text.split('\n')) + + # for file in f: + print('checking for filetype') + if f.endswith('.doc'): + doc_to_txt() + x = doc_to_txt.text + elif f.endswith('.docx'): + docx_to_txt() + x = docx_to_txt.text + elif f.endswith('.pdf'): + pdf_to_txt() + x = pdf_to_txt.text + + doc = nlp_model(x) + k = [] + l = [] + for ent in doc.ents: + # print(f'{ent.label_.upper():{30}}- {ent.text}') + k.append(ent.label_.upper()) + l.append(ent.text) + columns = k + rows = [l] + import pandas as pd + data = pd.DataFrame(rows, columns=columns) + df = data + + data = df.T + + data.to_csv('./Resume_parser/Ad1.csv', index=True) + + data = pd.read_csv('./Resume_parser/Ad1.csv') + # print(data) + data.rename({data.columns[-2]: 'Key'}, axis=1, inplace=True) + data.rename({data.columns[-1]: 'Values'}, axis=1, inplace=True) + data.to_csv('./Resume_parser/Ad1.csv', index=False) + ##################################################################################################### + # ModelName = "text-davinci-003" + # prompt_value = 'find designation in key value pairs from below text?' + "/n" + str(x) + # max_token_value = 300 + + # # usertext= request.get_data() + # # output = usertext.decode() + # # print(output) + # import os + # import openai + + # # print(usertext) + # openai.api_key = "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" + # # userinput='fibonacci series in python' + # import os + # import openai + + # # openai.api_key = os.getenv("OPENAI_API_KEY") + + # response_text = openai.Completion.create( + # model=ModelName, + # prompt=prompt_value, + # temperature=0, + # max_tokens=max_token_value, + # top_p=1, + # frequency_penalty=0, + # presence_penalty=0, + # stop=["\"\"\""] + # ) + # a = response_text['choices'] + # data = a[0]['text'] + # data=data.replace('\n','$@$') + # data=data.replace('$@$$@$','') + # #data=data.replace(':','') + # print(data) + # data=data.replace('Designation','POSITION') + # data=data.split('$@$') + # print(data) + # import pandas as pd + # desgnaition=pd.DataFrame(data) + # desgnaition=desgnaition[0].str.split(':',expand=True) + # desgnaition.columns=['Key','Values'] + # print(desgnaition) + + # data= pd.read_csv('./Resume_parser/Ad1.csv') + + # frames = [data,desgnaition] + + # result = pd.concat(frames,axis=0) + # result.to_csv('./Resume_parser/Ad1.csv', index=False) + + ######################################################################################################## + # df2 = pd.read_csv('./Ad1.csv') + x1 = pd.read_csv('D:/projects/C01app/Resume_parser/AD11.csv') + tp = pd.read_csv('./Resume_parser/Ad1.csv') + # tp = tp.loc[:, ~tp.columns.str.contains('^Unnamed')] + merge = pd.merge(tp, x1, on='Key', how='right') + merge.to_csv('./Resume_parser/AD.csv', index=False) + df2 = pd.read_csv('./Resume_parser/AD.csv') + # print(df2) + df2 = df2.T + + df2.to_csv('./Resume_parser/path.csv', index=False, header=False) + df1 = pd.read_csv('./Resume_parser/path.csv') + df1.to_json('./Resume_parser/firstjson.json', orient="index") + print(df1) + + doc = nlp_model1(x) + k = [] + l = [] + for ent in doc.ents: + # print(f'{ent.label_.upper():{30}}- {ent.text}') + k.append(ent.label_.upper()) + l.append(ent.text) + columns = k + rows = [l] + data = pd.DataFrame(rows, columns=columns) + df = data + data = df.T + + data.to_csv('./Resume_parser/Ad2.csv', index=True) + data = pd.read_csv('./Resume_parser/Ad2.csv') + data.rename({data.columns[-2]: 'Key'}, axis=1, inplace=True) + data.rename({data.columns[-1]: 'Values'}, axis=1, inplace=True) + data.to_csv('./Resume_parser/Ad2.csv', index=False) + import pandas as pd + import json + dflist = [] + x = pd.read_csv('D:/projects/C01app/Resume_parser/PG.csv') + tp = pd.read_csv('./Resume_parser/Ad2.csv') + # tp = tp.loc[:, ~tp.columns.str.contains('^Unnamed')] + merge = pd.merge(x, tp, on='Key', how='left') + import numpy as np + merge = merge.replace(np.nan, '', regex=True) + merge.to_csv('./Resume_parser/PGmerge.csv', index=False) + + dfPG = pd.read_csv('./Resume_parser/PGmerge.csv') + import numpy as np + dfPG = dfPG.replace({np.nan: None}) + x2 = dfPG.iloc[:, -2].tolist() + y2 = dfPG.iloc[:, -1].tolist() + z1 = dict(zip(x2, y2)) + dflist.append(z1) + # u1 = json.dumps(z1) + import pandas as pd + + x = pd.read_csv('D:/projects/C01app/Resume_parser/UG.csv') + tp = pd.read_csv('./Resume_parser/Ad2.csv') + # tp = tp.loc[:, ~tp.columns.str.contains('^Unnamed')] + merge = pd.merge(x, tp, on='Key', how='left') + import numpy as np + merge = merge.replace(np.nan, '', regex=True) + merge.to_csv('./Resume_parser/UGmerge.csv', index=False) + + dfUG = pd.read_csv('./Resume_parser/UGmerge.csv') + import numpy as np + dfUG = dfUG.replace({np.nan: None}) + x2 = dfUG.iloc[:, -2].tolist() + y2 = dfUG.iloc[:, -1].tolist() + z2 = dict(zip(x2, y2)) + dflist.append(z2) + # u2 = json.dumps(z2) + # final = '[' + str(z1) + ',' + str(z2) + ']' + # return render_template('resume.html') + + ############################################################################ + import pandas as pd + + x = pd.read_csv('D:/projects/C01app/Resume_parser/inter.csv') + tp = pd.read_csv('./Resume_parser/Ad2.csv') + # tp = tp.loc[:, ~tp.columns.str.contains('^Unnamed')] + merge = pd.merge(x, tp, on='Key', how='left') + import numpy as np + merge = merge.replace(np.nan, '', regex=True) + merge.to_csv('./Resume_parser/intermerge.csv', index=False) + + dfinter = pd.read_csv('./Resume_parser/intermerge.csv') + import numpy as np + dfinter = dfinter.replace({np.nan: None}) + x2 = dfinter.iloc[:, -2].tolist() + y2 = dfinter.iloc[:, -1].tolist() + z3 = dict(zip(x2, y2)) + dflist.append(z3) + + ############################################################################ + import pandas as pd + + x = pd.read_csv('D:/projects/C01app/Resume_parser/SSC.csv') + tp = pd.read_csv('./Resume_parser/Ad2.csv') + # tp = tp.loc[:, ~tp.columns.str.contains('^Unnamed')] + merge = pd.merge(x, tp, on='Key', how='left') + import numpy as np + merge = merge.replace(np.nan, '', regex=True) + merge.to_csv('./Resume_parser/sscmerge.csv', index=False) + + dfssc = pd.read_csv('./Resume_parser/sscmerge.csv') + import numpy as np + dfssc = dfssc.replace({np.nan: None}) + x2 = dfssc.iloc[:, -2].tolist() + y2 = dfssc.iloc[:, -1].tolist() + z4 = dict(zip(x2, y2)) + dflist.append(z4) + ############################################Document############################################################ + import base64 + empty = [] + name = f + image = open(name, 'rb') + image_read = image.read() + image_64_encode = base64.b64encode(image_read) + NULL = 'null' + # empty.append("ByteData--" + (NULL).strip('""')) + image_64_encode = image_64_encode.decode('utf-8') + empty.append("FileData--" + str(image_64_encode)) + imagedata = name.split("/") + imagename = str(imagedata[-1]).replace('"', '').replace("[", "").replace("]", "") + imagename1 = str(imagename).split('.') + + imagename = str(imagename1[-2]).replace("[", "]") + empty.append("FileName--" + imagename) + empty.append("FilePath--" + "") + imageExtension = str(imagename1[-1]).replace("[", "]") + empty.append("FileType--" + imageExtension) + + import pandas as pd + df = pd.DataFrame(empty) + df = df[0].str.split("--", expand=True) + data1 = pd.DataFrame(df[0]) + data2 = pd.DataFrame(df[1]) + dt = data2.set_index(data1[0]) + + dt4 = dt.T + list = [] + dictionary = dt4.to_dict(orient="index") + + a = { + "FileId": 0, + "FileData": "", + "FileName": "", + "FileType": "", + "RefId": 0 + } + list = [] + + list.append(a) + list.append(dictionary[1]) + + import json + + with open('./Resume_parser/firstjson.json', 'r') as json_file: + json_load = json.load(json_file) + + # url = "https://test.bizgaze.app:8443/apis/v4/bizgaze/integrations/businesscards/create" + + nothing = json.dumps(json_load).replace("]", "").replace("[", "").replace('{"0":', '').replace('}}', '}') + import json + + # JSON data: + x = nothing + + # python object to be appended + y = {"EducationDetails": dflist} + y1 = {"Document": list} + print(y) + # parsing JSON string: + z = json.loads(x) + + # appending the data + z.update(y) + z.update(y1) + + # the result is a JSON string: + # print(json.dumps(z)) + print('##########################') + # print(z) + print('##########################') + import requests + import json + + # with open('visitingcard1.json', 'r') as json_file: + # json_load = json.load(json_file) + # url = "https://qa.bizgaze.app/apis/v4/bizgaze/integrations/resumeparsing/save" #dev + # # url = "https://test.bizgaze.app:8443/apis/v4/bizgaze/integrations/resumeparsing/save" + # #url = "https://qa.bizgaze.app/apis/v4/bizgaze/integrations/resumeparsing/save" #testing + # payload1 = json.dumps(z) + # print('--------------------------------------------------------------------------') + # # print(payload1) + # headers = { + # # 'Authorization': 'stat 53f27e671adf456e974f1d11ceb5db41', + # #'Authorization': 'stat 5702ce5a77d34e0381bc2f06588d9bcc',#dev + # 'Authorization': 'stat ed5dd14ee2094227849f6bbe2928bff3', #testing + # 'Content-Type': 'application/json' + # } + # response = requests.request("POST", url, headers=headers, data=payload1) + # print("##############################################################") + + # print(response.text) + # function_1.var=response + # a=str(response.text) + + files = glob.glob('./resume_upload/*') + for f in files: + os.remove(f) + + return z + # return 'done' + + +# return render_template('resume.html') + + +# @app.route('/upload_resume', methods=["POST"]) +def upload_resume(): + if __name__ == "__main__": + # print(os.getpid()) + + url_list = [] + Dataset = request.get_json() + # id = "100013660000125" + url_list.append(Dataset) + # multiprocessing + with multiprocessing.Pool(processes=1) as pool: + results = pool.map(predict_resume, url_list) + + pool.close() + return results[0] + + +@app.route("/Download_resume") +def Download_resume(): + # try: + with open("Ad1.csv", encoding="unicode_escape") as fp: + csv = fp.read() + return Response(csv, mimetype="text/csv", headers={"Content-disposition": "attachment; filename=Resume.csv"}) + + +############################################################################## Invoice Parser ################################################################################################### + +@app.route('/upload_invoice', methods=["POST", "GET"]) +def upload_invoice(): + Dataset = request.get_json() + # data = {'visiting': Dataset} + # a=url_list[0] + a = Dataset + + x = a['FileData'] + # print(x) + y = a['FileName'] + z = a['FileType'] + # CreatedBy=a['CreatedBy'] + + name = y + '.' + z + print(name) + + img_data = x.encode() + + import base64 + with open('./Invoice_parser/upload_invoice/' + name, "wb") as fh: + fh.write(base64.decodebytes(img_data)) + + # cmd = "python ./Invoice_parser/invoice.multiprocessing.py" + " " + str('./Invoice_parser/upload_invoice/' + name) + # os.system(cmd) + ##################################################################################################################################### + + name = './Invoice_parser/upload_invoice/' + name + extension = name.split('.')[-1] + + def image_to_text(): + print('####################### image-to-pdf ################') + + import cv2 + import numpy as np + fname = name + print(fname) + import pytesseract as tess + from PIL import Image + + tess.pytesseract.tesseract_cmd = r"C:\Program Files (x86)\Tesseract-OCR\tesseract.exe" + img = cv2.imread(fname) + # img = cv2.resize(img, None, fx=1.2, fy=1.2, interpolation=cv2.INTER_CUBIC) + + # img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) + + # kernel = np.ones((1, 1), np.uint8) + # img = cv2.dilate(img, kernel, iterations=1) + # img = cv2.erode(img, kernel, iterations=1) + + # img=cv2.threshold(cv2.GaussianBlur(img, (5, 5), 0), 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1] + pdf = tess.image_to_pdf_or_hocr(img, extension="pdf") + with open(Current_Working_Directory + "/Invoice_parser/demo.pdf", "w+b", ) as f: + f.write(pdf) + print('demo created') + import fitz + fname = Current_Working_Directory + '/Invoice_parser/demo.pdf' + doc = fitz.open(fname) + text = "" + for page in doc: + text = text + str(page.get_text()) + image_to_text.text = " ".join(text.split("\n")) + + # result = ocr.ocr( Current_Working_Directory + "/Invoice_parser/demo.pdf" , cls=True) + # result = result[0] + + # txts = [line[1][0] for line in result] + + # image_to_text.text = "" + # for i in txts: + # if len(i) < 4: + # continue + # # print(i+"\n") + # image_to_text.text = image_to_text.text + str(i) + "\n" + + def pdf_to_text(): + import fitz + fname = name + doc = fitz.open(fname) + text = "" + for page in doc: + text = text + str(page.get_text()) + pdf_to_text.text = " ".join(text.split("\n")) + + extensionlist = ['JPEG', 'jpg', 'png', 'JPG', 'PNG', 'jpeg'] + + if extension in extensionlist: + print('image' + extension) + image_to_text() + x = image_to_text.text + + else: + print('pdf' + extension) + pdf_to_text() + x = pdf_to_text.text + + import spacy + import sys + # import fitz + # fname = "uploads/0.pdf" + # doc = fitz.open(fname) + # text = "" + # for page in doc: + # text = text + str(page.get_text()) + # fitz = " ".join(text.split("\n")) + # # print(fitz) + import pandas as pd + + doc = nlp_model1(x) + k = [] + l = [] + for ent in doc.ents: + # print(f"{ent.label_.upper():{30}}- {ent.text}") + k.append(ent.label_.upper()) + l.append(ent.text) + columns = k + rows = [l] + data = pd.DataFrame(rows, columns=columns) + df = data + df = data.T + + df.to_csv(Current_Working_Directory + "/Invoice_parser/Invoice.csv") + import pandas as pd + df = pd.read_csv(Current_Working_Directory + "/Invoice_parser/Invoice.csv") + # df.head() + # df = df.T + # new_header = df.iloc[0] # grab the first row for the header + # df = df[1:] # take the data less the header row + # df.columns = new_header + # def df_column_uniquify(df): + # df_columns = df.columns + # new_columns = [] + # for item in df_columns: + # counter = 0 + # newitem = item + # while newitem in new_columns: + # counter += 1 + # newitem = "{}_{}".format(item, counter) + # new_columns.append(newitem) + # df.columns = new_columns + # return df.T + # df = df_column_uniquify(df) + # # df=df.T + # df.to_csv('final.csv') + # df = pd.read_csv('final.csv') + df.rename({df.columns[-2]: 'Key'}, axis=1, inplace=True) + df.rename({df.columns[-1]: 'Values'}, axis=1, inplace=True) + df['Key'] = df['Key'].str.replace('/', '') + df['Key'] = df['Key'].str.replace(' ', '') + df.to_csv(Current_Working_Directory + '/Invoice_parser/final.csv', index=False) + import pandas as pd + x1 = pd.read_csv(Current_Working_Directory + '/Invoice_parser/final.csv') + tp = pd.read_csv(Current_Working_Directory + '/Invoice_parser/finalwithcolen.csv') + merge = pd.merge(x1, tp, on='Key', how='right') + merge1 = merge + + merge['Values'] = merge['Values'].astype(str) + merge = merge['Values'].str.split(":", expand=True) + merge.rename({merge.columns[-1]: 'Values'}, axis=1, inplace=True) + frames = [merge1['Key'], merge['Values']] + result = pd.concat(frames, axis=1) + x1 = pd.read_csv(Current_Working_Directory + '/Invoice_parser/final.csv') + tp = pd.read_csv(Current_Working_Directory + '/Invoice_parser/finalwithoutcolen.csv') + merged = pd.merge(x1, tp, on='Key', how='right') + frames = [result, merged] + result1 = pd.concat(frames) + result1.to_csv(Current_Working_Directory + '/Invoice_parser/final1.csv', index=False) + + x1 = pd.read_csv(Current_Working_Directory + '/Invoice_parser/main.csv') + tp = pd.read_csv(Current_Working_Directory + '/Invoice_parser/final1.csv') + # tp = pd.read_csv(Current_Working_Directory + 'Invoice_parser/final.csv') + tp['Key'] = tp['Key'].astype(str) + tp['Values'] = tp['Values'].astype(str) + tp['Key'] = tp['Key'].str.strip() + tp['Values'] = tp['Values'].str.strip() + + merge = pd.merge(tp, x1, on='Key', how='right') + merge.to_csv(Current_Working_Directory + '/Invoice_parser/invoicewithouttable.csv', index=False) + df2 = pd.read_csv(Current_Working_Directory + '/Invoice_parser/invoicewithouttable.csv') + + # Import writer class from csv module + from csv import writer + + List = ['PlantCode', " "] + with open(Current_Working_Directory + '/Invoice_parser/invoicewithouttable.csv', 'a') as f_object: + writer_object = writer(f_object) + writer_object.writerow(List) + f_object.close() + # print(df2) + df2 = pd.read_csv(Current_Working_Directory + '/Invoice_parser/invoicewithouttable.csv') + print(df2) + df2 = df2.T + + df2.to_csv(Current_Working_Directory + '/Invoice_parser/invoicewithouttable.csv', index=False, header=False) + + df1 = pd.read_csv(Current_Working_Directory + '/Invoice_parser/invoicewithouttable.csv') + df1.to_json(Current_Working_Directory + '/Invoice_parser/firstjson.json', orient="index") + import pandas as pd + x = pd.read_csv(Current_Working_Directory + '/Invoice_parser/final.csv') + tp = pd.read_csv(Current_Working_Directory + '/Invoice_parser/item1.csv') + x['Values'] = x['Values'].str.strip() + merge = pd.merge(tp, x, on='Key', how='inner') + merge = merge.groupby('Key').agg({ + 'Values': '/'.join, + }).reset_index() + z = merge['Values'].str.split('/', expand=True) + frames = [merge, z] + result1 = pd.concat(frames, axis=1) + result1 = result1.drop(['Values'], axis=1) + import pandas as pd + tp = pd.read_csv(Current_Working_Directory + '/Invoice_parser/item1.csv') + merge = pd.merge(tp, result1, on='Key', how='inner') + merge = merge.T + new_header = merge.iloc[0] # grab the first row for the header + merge = merge[1:] # take the data less the header row + merge.columns = new_header + + merge = merge.to_dict('records') + invoice_Item = merge + print(invoice_Item) + + ####################################Document############################################################ + + import base64 + empty = [] + # name = found + image = open(name, 'rb') + image_read = image.read() + image_64_encode = base64.b64encode(image_read) + NULL = 'null' + # empty.append("ByteData--" + (NULL).strip('""')) + image_64_encode = image_64_encode.decode('utf-8') + empty.append("FileData--" + str(image_64_encode)) + imagedata = name.split("/") + imagename = str(imagedata[-1]).replace('"', '').replace("[", "").replace("]", "") + imagename1 = str(imagename).split('.') + imagename = str(imagename1[-2]).replace("[", "]") + empty.append("FileName--" + imagename) + empty.append("FilePath--" + name) + imageExtension = str(imagename1[-1]).replace("[", "]") + empty.append("FileType--" + imageExtension) + import pandas as pd + df = pd.DataFrame(empty) + df = df[0].str.split("--", expand=True) + data1 = pd.DataFrame(df[0]) + data2 = pd.DataFrame(df[1]) + dt = data2.set_index(data1[0]) + dt4 = dt.T + list = [] + dictionary = dt4.to_dict(orient="index") + + a = { + "FileId": 0, + "FileData": "", + "FileName": "", + "FileType": "", + "RefId": 0 + } + list = [] + list.append(a) + list.append(dictionary[1]) + import json + with open(Current_Working_Directory + '/Invoice_parser/firstjson.json', 'r') as json_file: + json_load = json.load(json_file) + # url = "https://test.bizgaze.app:8443/apis/v4/bizgaze/integrations/businesscards/create" + nothing = json.dumps(json_load).replace("]", "").replace("[", "").replace('{"0":', '').replace('}}', '}') + import json + + # JSON data: + x = nothing + # python object to be appended + y = {"InvoiceItems": invoice_Item} + y1 = {"Document": list} + # parsing JSON string: + z = json.loads(x) + # appending the data + z.update(y) + z.update(y1) + # print(z) + # the result is a JSON string: + # print(json.dumps(z)) + # print('##########################') + # print(z) + # print('##########################') + # import requests + # import json + # # with open('visitingcard1.json', 'r') as json_file: + # # json_load = json.load(json_file) + # url = "https://test.bizgaze.app:8443/apis/v4/bizgaze/integrations/invoice/createsalesinvoice" + # #url="https://test.bizgaze.app:8443/apis/v4/bizgaze/integrations/invoice/createsalesinvoice" + # payload1 = json.dumps(z) + # print('--------------------------------------------------------------------------') + # print(payload1) + # headers = { + # 'Authorization': 'stat 089166c35d4c4d7d941c99d6f8986834', + # 'Content-Type': 'application/json' + # } + # response = requests.request("POST", url, headers=headers, data=payload1) + # print("##############################################################") + # print(response.text) + # import glob + # files = glob.glob( + # "upload_invoice/*" + # ) + # for f in files: + # os.remove(f) + # files = glob.glob( + # "uploads/*" + # ) + # for f in files: + # os.remove(f) + + return z + + # return render_template('invoice.html') + + +@app.route("/Download_invoice") +def Download_invoice(): + pass + + +@app.route("/Table") +def Table(): + pass if __name__ == "__main__": - app.run(host='0.0.0.0', port=1112) \ No newline at end of file + app.run(host='0.0.0.0', port=1112) + +