
Business_cards.py 74KB

from flask import Flask, render_template, request, redirect, Response, send_file
import os
# import openai
import requests
import pandas as pd
import pgeocode
from email_scraper import scrape_emails
import phonenumbers
from pdfminer.high_level import extract_text
import pytesseract
import pytesseract as tess
import time
import multiprocessing
from PIL import Image
from functools import partial
from urlextract import URLExtract
# from doctr.io import DocumentFile
# from doctr.models import ocr_predictor
# model = ocr_predictor(pretrained=True)
# load tagger
######################################################
import glob
import shutil
import cv2
import matplotlib
from werkzeug.utils import secure_filename
import spacy
# nlp_model = spacy.load("D:/projects/C01app/Resume_parser/ME")
# nlp_model1 = spacy.load("D:/projects/C01app/Resume_parser/bdeeducation_50_0.2")
################################################################
Current_Working_Directory = os.getcwd()
Current_Working_Directory = Current_Working_Directory.replace("\\", "/")
# nlp_model1 = spacy.load(Current_Working_Directory + "/Invoice_parser/p")
################################################################
# import spacy
# nlp_model1 = spacy.load('./ADD3001.2')
from flair.data import Sentence
from flair.models import SequenceTagger
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# Models are loaded once at import time; every request reuses them.
tokenizer = AutoTokenizer.from_pretrained("ml6team/bert-base-uncased-city-country-ner")
model = AutoModelForTokenClassification.from_pretrained("ml6team/bert-base-uncased-city-country-ner")

from paddleocr import PaddleOCR, draw_ocr

ocr = PaddleOCR(use_angle_cls=True, lang='en', use_space_char=True, show_log=False)
tagger = SequenceTagger.load("flair/ner-english-large")
# tagger.to("cuda")

import datetime

app = Flask(__name__)
# app.config["IMAGE_UPLOADS"] = "C:/inetpub/wwwroot/FlaskApplication/Flask_Demo/upload/"
@app.route('/', methods=['GET'])
def home():
    return render_template('home.html')


@app.route('/resume', methods=['GET'])
def resume():
    return render_template('resume.html')


@app.route('/invoice', methods=['GET'])
def invoice():
    return render_template('invoice.html')


@app.route('/card', methods=['GET'])
def card():
    return render_template('card.html')
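
# A minimal client sketch for the upload endpoint below. The host, port, and
# file name here are assumptions for illustration, not part of this app; the
# payload shape (a list of {FileData, FileName, FileType} dicts with base64
# file content) is what multiplecards() reads out of request.get_json().
#
#     import base64, requests
#     with open("card.jpg", "rb") as fh:                      # hypothetical file
#         payload = [{
#             "FileData": base64.b64encode(fh.read()).decode("utf-8"),
#             "FileName": "card",
#             "FileType": "jpg",
#         }]
#     r = requests.post("http://localhost:5000/upload_BusinessCards", json=payload)
#     print(r.json())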
@app.route('/upload_BusinessCards', methods=["POST"])
# @app.route('/multiplecards', methods=["POST"])
def multiplecards():
    # print('################## multiple card detection #######################')
    # print(Dataset)
    datalist = []
    zlist = []
    Dataset = request.get_json()
    # print(data)
    # datalist.append(Dataset)
    data = {'visiting': Dataset}
    for i in data['visiting']:
        import time
        # time.sleep(1)
        a = i
        x = a['FileData']
        # print(x)
        y = a['FileName']
        z = a['FileType']
        # CreatedBy = a['CreatedBy']
        name = y + '.' + z
        # print(name)
        # print(y)
        # image = y.split("/")
        # filename = image[-1]
        # print(x)
        img_data = x.encode()
        import base64
        # Decode the base64 payload and save it as the uploaded card file.
        with open('./multicards/' + name, "wb") as fh:
            fh.write(base64.decodebytes(img_data))
        # print(i)
        # for i in glob.glob('./multipleupload/*'):
        found = './multicards/' + name
        print(found)
        extension = found.split('.')[-1]
        # for root, dirs, files in os.walk('./multipleupload'):
        #     for name in files:
        #         foundfile = os.path.join(root, name)
        #         print(foundfile)
        import re
        import csv
        import glob
        import os
        # import pytesseract
        # import cv2
        import numpy as np
        import cv2
        import requests
        final = []
        # final.append('assignto--' + CreatedBy)
        imagelist = []
        # print(found)
        remove_list = []
        import pdfminer
        # ts = 0
        # for file_name in glob.glob('./upload/*'):
        #     fts = os.path.getmtime(file_name)
        #     if fts > ts:
        #         ts = fts
        #         found = file_name
        # print(found)
        # print(extension)
        def org_name():
            print('org_name is working')
            import pytesseract
            fname = found
            if extension != 'pdf':
                # Grayscale the image, re-save it at high DPI, and OCR it into a
                # searchable PDF so pdfminer can extract the text.
                img = cv2.imread(fname)
                img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
                cv2.imwrite(str(found), img)
                from PIL import Image
                im = Image.open(found)
                im.save("images1.png", dpi=(1200, 1200))
                fname = "images1.png"
                import pytesseract as tess
                from PIL import Image
                tess.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
                pdf = tess.image_to_pdf_or_hocr(fname, extension="pdf")
                with open("demo.pdf", "w+b") as f:
                    f.write(pdf)
                from pdfminer.high_level import extract_text
                text = extract_text('demo.pdf')
                # doc = DocumentFile.from_images(found)
                # result = model(doc)
                # text = result.render()
            else:
                from pdfminer.high_level import extract_text
                text = extract_text(fname)
            sentence = Sentence(text)
            # predict NER tags
            tagger.predict(sentence)
            # print sentence
            ko = sentence
            ko1 = str(ko).split("→")
            import pandas as pd
            dfg = []
            try:
                s = ko1[1].replace("/", ":")
                # os.remove(found)
                # return 'Invalid image'
                dfg.append(s)
                df = pd.DataFrame(dfg)
                df = df[0]
                df.to_csv("df.csv", index=False)
                df1 = pd.read_csv("df.csv")
                ve = df1["0"].str.split(",")
                fgf = ve.to_list()
                dfgh = pd.DataFrame(fgf[0])
                maindf = dfgh[0]  # .str.split(":")
                # maindf.to_csv("main.csv")
                main1 = maindf.to_list()
                # cv = pd.DataFrame(ve)
                per = ["PER"]
                org = ["ORG"]
                loc = ["LOC"]
                organizations = [i for i in main1 for j in org if j in i]
                PErsons = [i for i in main1 for j in per if j in i]
                location = [i for i in main1 for j in loc if j in i]
            except IndexError:
                pass
        # ************************************* ORGANIZATION *************************************
        def organisation():
            print('organisation working')
            try:
                if len("OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace('.com', '').replace('.in', '') + " /" + organizations[1].replace(":ORG", "").replace('"', '').replace('.com', '')) < 4:
                    pass
                else:
                    match = str(urlfinal[0]).lower()
                    match = match.replace('.com', '').replace('www.', '').replace('.in', '').replace('.co', '').replace('https', '').replace('http', '').replace(":", "").replace("/", "").upper()
                    print(match)
                    s1g = organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace('.com', '') + " /" + organizations[1].replace(":ORG", "").replace('"', '').replace('.com', '')
                    s1 = s1g.upper()
                    s2 = match.upper()
                    from difflib import SequenceMatcher
                    print(s1)
                    print(s2)
                    print(SequenceMatcher(None, s1, s2).ratio())
                    if SequenceMatcher(None, s1, s2).ratio() >= 0.10:
                        # and SequenceMatcher(None, s1, s2).ratio() < 0.50:
                        final.append("OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace('.com', '').replace(']', '') + " /" + organizations[1].replace(":ORG", "").replace('"', '').replace('.com', '').replace(']', ''))
                    else:
                        final.append("OrganizationName--" + s2)
            except IndexError:
                try:
                    if len("OrganizationName--" + organizations[0].replace(":ORG", "").replace('[', '').replace(']', '').replace('"', '').replace('.com', '').replace('.in', '')) < 4:
                        pass
                    else:
                        match = str(urlfinal[0]).lower()
                        match = match.replace('.com', '').replace('www.', '').replace('.in', '').replace('.co', '').replace('https', '').replace('http', '').replace(":", "").replace("/", "").upper()
                        s1g = organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace('.com', '')
                        s1 = s1g.upper()
                        s2 = match.upper()
                        from difflib import SequenceMatcher
                        print(s1)
                        print(s2)
                        print(SequenceMatcher(None, s1, s2).ratio())
                        if SequenceMatcher(None, s1, s2).ratio() >= 0.10:
                            # and SequenceMatcher(None, s1, s2).ratio() < 0.50:
                            final.append("OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace(']', '').replace('.com', ''))
                        else:
                            final.append("OrganizationName--" + s2)
                except IndexError:
                    try:
                        match = str(urlfinal[0]).lower()
                        match = match.replace('.com', '').replace('www.', '').replace('.in', '').replace('.co', '').upper()
                        final.append("OrganizationName--" + match)
                        # remove_list.append(match)
                    except IndexError:
                        company()
        ################################################# company Name ########################################
        def company():
            print('company list working')
            import re
            new = []
            with open('test.txt', 'r+') as f:
                flag = False
                for line in f:
                    line = line.upper()
                    matches = re.findall(
                        r'''\bENTERPRISE\b|\bTRADE\b|\bEMPIRE\b|\bSTORES\b|\bMACHINERY\b|\bINDUSTRIES\b|\bTECHNOLOGY\b|\bCOMPANY\b|\bDESIGNERS\b|\bPOLYMER\b|\bBELT\b|\bAGRO\b|\bPLASTIC\b|\bGROUP\b|\bTOOLS\b|\bENGG.\b|\bSOLUTION\b|\bCONSTRUCTION\b|\bPACK\b|\bELECT\b|\bSTEEL\b|\bIRON\b|\bDIES\b|\bMOULD\b|\bCORPORATION\b|\bSEEDS\b|\bPOWER\b|\bCONSULTANT\b|\bMFG.\b|\bPRINT\b|\bFOOD\b|\bSOLAR\b|\bINDUSTRY\b|\bLIMITED\b|\bPRIVATE\b|\bPVT\b|\bLTD\b|\bOUTSOURCING\b|\bCNC\b|\bMACHINERIES\b|\bSOLUTIONS\b|\bENGINEERS\b|\bWORKS\b|\bPRODUCTS\b|\bENTERPRISES\b|\bCOMPANIES\b|\bPOLYMERS\b|\bTRADING\b''',
                        line)
                    for i in matches:
                        if i in line:
                            flag = True
                    if flag:
                        o = "OrganizationName--" + line
                        new.append(o)
                    # if line.startswith('\n'):
                    #     flag = False
            try:
                a = new[0].replace('\n', '')
                final.append(a)
            except IndexError:
                final.append("OrganizationName--")
        # ************************************* CONTACT PERSON *************************************
        def contactpersonname():
            print('contactpersonname working')
            try:
                final.append("CONTACTPERSONNAME--" + PErsons[0].replace(":PER", "").replace("[", "").replace('"', '').replace("]", "") + '/' + PErsons[1].replace(":PER", "").replace('"', ''))
            except IndexError:
                try:
                    final.append("CONTACTPERSONNAME--" + PErsons[0].replace(":PER", "").replace("[", "").replace("]", "").replace('"', ''))
                except IndexError:
                    final.append("CONTACTPERSONNAME--")
        def image_to_text():
            # doc = DocumentFile.from_images(found)
            # result = model(doc)
            # image_to_text.txt = result.render()
            # tess.pytesseract.tesseract_cmd = r"C:\\Program Files (x86)\\Tesseract-OCR\\tesseract.exe"
            # img = Image.open(found)
            # text = tess.image_to_string(img)
            # image_to_text.txt = text
            # print(text)
            import cv2
            img_path = found
            img = cv2.imread(img_path)
            img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            cv2.imwrite(str(found), img)
            # PaddleOCR returns one result per page; keep the recognized strings,
            # skipping fragments shorter than four characters.
            result = ocr.ocr(img_path, cls=True)
            result = result[0]
            txts = [line[1][0] for line in result]
            image_to_text.txt = ""
            for i in txts:
                if len(i) < 4:
                    continue
                # print(i + "\n")
                image_to_text.txt = image_to_text.txt + str(i) + "\n"
            # print(image_to_text.txt)

        def pdf_to_text():
            from pdfminer.high_level import extract_text
            pdf_to_text.txt = extract_text(found)
            # pdf_to_text.txt = text.replace('\n', ' ')
        extensionlist = ['JPEG', 'jpg', 'png', 'JPG', 'PNG', 'jpeg']
        if extension in extensionlist:
            print('image' + extension)
            image_to_text()
            x = image_to_text.txt
        else:
            print('pdf' + extension)
            pdf_to_text()
            x = pdf_to_text.txt
        verticaltext = x
        htext = x
        # print('############ this is verticaltext ############')
        print(verticaltext)
        htext = htext.replace('\n', ' ')
        # print('############ this is htext ############')
        # print(htext)
        y = x.replace('\n', ',')
        y = y.replace(' ', ' ')
        # y = y.replace(".", " .")
        horizontaltext = y
        # print('############ this is horizontaltext ############')
        # print(horizontaltext)
        textfile = open("test123456.txt", "w")
        a = textfile.write(verticaltext)
        textfile.close()
        textfile = open("vtext.txt", "w")
        a = textfile.write(horizontaltext)
        textfile.close()
        # Copy non-blank lines (ignoring '|' noise) into test.txt for the keyword scans.
        with open('test123456.txt', 'r') as f:
            with open('test.txt', 'w') as w:
                for line in f:
                    if line.strip().replace('|', ''):
                        w.write(line)
        ########################### ADDRESS ##################################
        addrespinlst = []

        def splitaddress():
            import re
            textaddress = htext.replace('\n', ' ')
            # print(textaddress)
            address1 = textaddress.partition(",")[0]
            words = address1.split()
            address1 = words[-1]
            addre = htext.partition(",")[2]
            a = addre.replace('\n', ' ').replace('\x0c', '')
            addre = a.partition(",")[2]
            matches = re.findall(
                r'(.*?)-\d{3} \d{3}|(.*?)\b-\d{6}\b|(.*?)\b\d{6}\b|(.*?)\b\d{3} \d{3}\b|\b(.*?)-\d{2}\b|(.*?)\b\d{3} \d{3}\b',
                a)
            for match in matches:
                address2 = match
                address2 = str(address2)
                address2 = address2.replace("'", "").replace("(", "").replace(")", "").replace(', ,', '').replace(' ', '')
            matches = re.findall(r'-\d{6}\b|\b\d{6}\b|\b\d{3} \d{3}\b|\b-\d{2}\b|-\d{3} \d{3}|\b\d{3} \d{3}\b', a)
            for address3 in matches:
                pass
            try:
                Address = address1 + "," + address2 + "," + address3
                final.append('ADDRESS--' + Address)
                addrespinlst.append(Address)
            except NameError:
                final.append('ADDRESS--')
            # print('############ Addressmodelworking ############')
            # doc = nlp_model1(textaddress)
            # addlist = []
            # for ent in doc.ents:
            #     name = (f'{ent.label_.upper():{10}}--{ent.text}')
            #     addlist.append(name)
            # try:
            #     Address = addlist[0]
            #     final.append(Address)
            #     addrespinlst.append(Address)
            #     remove_list.append(str(Address).replace("[", "").replace("]", "").replace("\\n", "").replace("'", "").replace("ADDRESS--", ""))
            # except IndexError:
            #     final.append("ADDRESS--")
            pass
        ################################################## website #######################################################
        # import re
        # url = []
        # matches = re.findall(r'www.*', verticaltext)
        # for match in matches:
        #     if (match.count('.')) == 1:
        #         a_string1 = match.replace("www", "www.")
        #         final.append("Urls--" + a_string1)
        #         url.append(a_string1)
        #     else:
        #         final.append("Urls--" + match)
        # if len(url) == 0:
        #     from urlextract import URLExtract
        #     extractor = URLExtract()
        #     urls = extractor.find_urls(verticaltext)
        #     try:
        #         urllist = urls[0]
        #         final.append("Urls--" + urllist)
        #         url.append(urllist)
        #     except IndexError:
        #         final.append("Urls--")
        #     for match in matches:
        #         if (match.count('.')) == 1:
        #             a_string1 = match.replace("www", "www.")
        #             final.append("Urls--" + a_string1)
        #             url.append(a_string1)
        #         else:
        #             final.append("Urls--" + match)
        #             url.append(match)
        #             remove_list.append(match)
        # else:
        #     final.append("Urls--")
        ################################################## website #######################################################
        import re
        # final = []
        url = []
        urlfinal = []
        matches = re.findall(r'www.*', verticaltext)
        for match in matches:
            if match.count('.') == 1:
                a_string1 = match.replace("www", "www.")
                # final.append("Urls--" + a_string1)
                url.append(a_string1)
            else:
                url.append(match)
        if len(url) == 0:
            from urlextract import URLExtract
            extractor = URLExtract()
            urls = extractor.find_urls(verticaltext)
            try:
                urllist = urls[0]
                url.append(urllist)
                url.append(urllist)
            except IndexError:
                pass
            for match in matches:
                if match.count('.') == 1:
                    a_string1 = match.replace("www", "www.")
                    url.append(a_string1)
                    # url.append(a_string1)
                else:
                    url.append(match)
                    url.append(match)
        else:
            pass
        try:
            test_string = url[0]
            test_list = ['com', 'www', 'in', 'co', "WWW", "COM", "CO", "IN"]
            res = [ele for ele in test_list if ele in test_string]
            if len(res) == 0:
                print('no match')
                final.append('urls--')
            else:
                print('matched')
                final.append('urls--' + url[0])
                urlfinal.append(url[0])
        except IndexError:
            final.append('urls--')
        print('############ url ############')
        print(url)
        ####### organisation and contact ################
        # def company_url():
        #     # print('--url--')
        #     # print(url)
        #     try:
        #         match = str(url[0]).lower()
        #         match = match.replace('.com', '').replace('www.', '').replace('.in', '').replace('.co', '').upper()
        #         final.append("OrganizationName--" + match)
        #         # remove_list.append(match)
        #     except IndexError:
        #         org_name()
        #         organisation()
        #         final.append("OrganizationName--")
        # make example sentence
        # print(horizontaltext)
        sentence = Sentence(verticaltext)
        # predict NER tags
        tagger.predict(sentence)
        # print sentence
        ko = sentence
        ko1 = str(ko).split("→")
        import pandas as pd
        dfg = []
        try:
            s = ko1[1].replace("/", ":")
        except IndexError:
            # No entities found in the OCR text: discard the file and bail out.
            os.remove(found)
            return 'Invalid image'
        dfg.append(s)
        df = pd.DataFrame(dfg)
        df = df[0]
        df.to_csv("df.csv", index=False)
        df1 = pd.read_csv("df.csv")
        ve = df1["0"].str.split(",")
        fgf = ve.to_list()
        dfgh = pd.DataFrame(fgf[0])
        maindf = dfgh[0]  # .str.split(":")
        # maindf.to_csv("main.csv")
        main1 = maindf.to_list()
        # cv = pd.DataFrame(ve)
        per = ["PER"]
        org = ["ORG"]
        loc = ["LOC"]
        organizations = [i for i in main1 for j in org if j in i]
        PErsons = [i for i in main1 for j in per if j in i]
        location = [i for i in main1 for j in loc if j in i]
        # ************************************* ORGANIZATION *************************************
        try:
            if len("OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace(']', '').replace('.com', '') + " /" + organizations[1].replace(":ORG", "").replace('"', '').replace('.com', '')) < 4:
                pass
                # company_url()
            else:
                match = str(urlfinal[0]).lower()
                match = match.replace('.com', '').replace('www.', '').replace('.in', '').replace('.co', '').replace('https', '').replace('http', '').replace(":", "").replace("/", "").upper()
                print(match)
                s1g = organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace(']', '').replace('.com', '') + " /" + organizations[1].replace(":ORG", "").replace('"', '').replace('.com', '')
                s1 = s1g.upper()
                s2 = match.upper()
                from difflib import SequenceMatcher
                print(s1)
                print(s2)
                print(SequenceMatcher(None, s1, s2).ratio())
                if SequenceMatcher(None, s1, s2).ratio() >= 0.10:
                    # and SequenceMatcher(None, s1, s2).ratio() < 0.50:
                    final.append("OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace('.com', '').replace(']', '') + " /" + organizations[1].replace(":ORG", "").replace('"', '').replace('.com', '').replace(']', ''))
                else:
                    final.append("OrganizationName--" + s2)
        except IndexError:
            try:
                if len("OrganizationName--" + organizations[0].replace(":ORG", "").replace('[', '').replace(']', '').replace('"', '').replace('.com', '')) < 4:
                    pass
                    # company_url()
                else:
                    match = str(urlfinal[0]).lower()
                    match = match.replace('.com', '').replace('www.', '').replace('.in', '').replace('.co', '').replace('https', '').replace('http', '').replace(":", "").replace("/", "").upper()
                    s1g = organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace(']', '').replace('.com', '')
                    s1 = s1g.upper()
                    s2 = match.upper()
                    from difflib import SequenceMatcher
                    print(s1)
                    print(s2)
                    print(SequenceMatcher(None, s1, s2).ratio())
                    if SequenceMatcher(None, s1, s2).ratio() >= 0.10:
                        # and SequenceMatcher(None, s1, s2).ratio() < 0.50:
                        final.append("OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace(']', '').replace('.com', ''))
                    else:
                        final.append("OrganizationName--" + s2)
            except IndexError:
                company()
                # org_name()
                # organisation()
                # final.append("OrganizationName--")
        ################################################### Email ######################################################
        import re
        from email_scraper import scrape_emails
        s = list(scrape_emails(horizontaltext))
        email_id1 = s
        email_id = []

        # Extract email addresses from a text with a simple regex.
        def extract_emails(text):
            email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,7}\b'
            return re.findall(email_pattern, text)

        # Iterate through the scraped strings and extract email addresses from each value.
        for text in email_id1:
            email_addresses = extract_emails(text)
            if email_addresses:
                for email in email_addresses:
                    # print(email)
                    email_id.append(email)
            else:
                print("No email addresses found in the text.")
        # Remove any literal "email"/"Email"/"E-mail" labels OCR attached to the address.
        email_id = [item.replace("email", "").replace("Email", "").replace("E-mail", "") for item in email_id]
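        # Illustrative behaviour (the address here is made up):
        #     extract_emails("sales: john.doe@acme.com") -> ['john.doe@acme.com']
        # and strings without an address yield an empty list, which triggers the
        # "No email addresses found" branch above.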
        # ************************************* CONTACT PERSON *************************************
        try:
            # Deliberately raise IndexError ('Hello' has only 5 characters, so index
            # -6 is out of range) to force the except branch below; the original
            # append is kept commented out.
            my_string = 'Hello'
            print(my_string[-6])
            # final.append("CONTACTPERSONNAME--" + PErsons[0].replace(":PER", "").replace("[", "").replace('"', '').replace("]", "") + PErsons[1].replace(":PER", "").replace('"', '') + PErsons[2].replace(":PER", "").replace("[", "").replace('"', '').replace("]", ""))
        except IndexError:
            try:
                final.append("CONTACTPERSONNAME--" + PErsons[0].replace(":PER", "").replace("[", "").replace("]", "").replace('"', ''))
                person_name = PErsons[0].replace(":PER", "").replace("[", "").replace("]", "").replace('"', '').replace(' ', '')
                if not email_id:
                    final.append('ContactEmail--')
                    final.append('OrganizationEmail--')
                else:
                    per_Name = []
                    per_Name.append(person_name)
                    print(email_id)

                    # Score each email against the person name by position-wise
                    # character overlap and return the best match.
                    def calculate_matching_percentage(word_list, words):
                        def calculate_single_matching_percentage(word, item):
                            max_length = max(len(word), len(item))
                            word = word.upper()
                            item = item.strip().replace(" ", "").upper()
                            matching_chars = sum(1 for c1, c2 in zip(item, word) if c1 == c2)
                            return (matching_chars / max_length) * 100

                        highest_percentage = 0.0
                        highest_matching_item = None
                        for word in words:
                            word = word.upper()
                            for item in word_list:
                                original_item = item
                                item = item.strip().replace(" ", "").upper()
                                matching_percentage = calculate_single_matching_percentage(word, item)
                                if matching_percentage > highest_percentage:
                                    highest_percentage = matching_percentage
                                    highest_matching_item = original_item
                        return highest_matching_item, highest_percentage
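                    # A sketch of what this returns (inputs are made-up examples):
                    #     calculate_matching_percentage(['john@acme.com', 'info@acme.com'], ['john'])
                    # compares characters position by position, so 'john@acme.com'
                    # scores highest and is returned with its percentage.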
                    word_list = email_id
                    # Keep only the part after the first '.' in the person name (drops initials).
                    per_Name = [item.split('.')[1] if '.' in item else item for item in per_Name]
                    print(per_Name)
                    word2 = per_Name
                    for word in word2:
                        highest_matching_item, highest_percentage = calculate_matching_percentage(word_list, [word])
                        if highest_matching_item is not None:
                            print(f"For '{word}', the highest matching percentage is {highest_percentage:.2f}% with '{highest_matching_item}'")
                        else:
                            print(f"For '{word}', no matches found.")
                    # final.append('OrganizationEmail--' + email_id[0])
                    if len(word_list) == 1:
                        if highest_percentage >= 15:
                            print(highest_matching_item)
                            final.append('ContactEmail--' + str(highest_matching_item).replace("[", "").replace("]", "").replace("\\n", "").replace("'", ""))
                            final.append('OrganizationEmail--')
                        else:
                            print('not matched')
                            final.append('OrganizationEmail--' + email_id[0])
                            final.append('ContactEmail--')
                    else:
                        print('it has more elements')
                        if highest_percentage >= 15:
                            print(highest_matching_item)
                            final.append('ContactEmail--' + str(highest_matching_item).replace("[", "").replace("]", "").replace("\\n", "").replace("'", ""))
                            # Given list of email addresses
                            email_list = word_list
                            # Email address to remove
                            email_to_remove = highest_matching_item
                            # Check if the email address is in the list before removing it
                            if email_to_remove in email_list:
                                email_list.remove(email_to_remove)
                                print(f"'{email_to_remove}' has been removed from the list.")
                            else:
                                print(f"'{email_to_remove}' is not in the list.")
                            # Print the updated list
                            print("Updated email list:", email_list)
                            final.append('OrganizationEmail--' + str(email_list[0]).replace("[", "").replace("]", "").replace("\\n", "").replace("'", ""))
                        else:
                            final.append('OrganizationEmail--' + str(email_id[0]) + ',' + str(email_id[1]))
            except IndexError:
                # org_name()
                # contactpersonname()
                final.append("CONTACTPERSONNAME--")
                if len(email_id) > 1:
                    final.append('OrganizationEmail--' + str(email_id[0]).replace("[", "").replace("]", "").replace("\\n", "").replace("'", ""))
                    final.append('ContactEmail--' + str(email_id[-1]).replace("[", "").replace("]", "").replace("\\n", "").replace("'", ""))
                else:
                    try:
                        final.append('ContactEmail--' + str(email_id[0]).replace("[", "").replace("]", "").replace("\\n", "").replace("'", ""))
                        final.append('OrganizationEmail--')
                    except IndexError:
                        final.append('ContactEmail--')
                        final.append('OrganizationEmail--')
        ############### address flair #####################
        try:
            print('############ address new code ############')
            locationlst = ['address', 'factory', 'd.no', 'h.no', 'h. no', 'plot', 'flat', 'plat']
            loclst = [i for i in locationlst if i in htext.lower()]
            textaddress = htext
            textaddress = textaddress.replace("|", ",")
            textaddress = textaddress.lower()
            nlp = pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy="simple")
            grop = nlp(textaddress)
            citycountry = []
            print('########################### city or country name ###########################')
            d = grop[-1]
            if d['entity_group'] == "COUNTRY":
                print(d["word"])
                citycountry.append(d["word"])
            elif d['entity_group'] == "CITY":
                print(d["word"])
                citycountry.append(d["word"])
            try:
                address1 = loclst[0]
            except IndexError:
                address1 = textaddress.partition(",")[0]
                words = address1.split()
                address1 = words[-1]
            star_location = address1.lower()
            end_location = citycountry[0].replace("#", "")
            start = star_location
            end = end_location
            s = textaddress.lower()
            # Take the text between the first address keyword and the detected city/country.
            middle_address = (s.split(start))[-1].split(end)[0]
            Address = start + middle_address + end
            Address = Address.replace('--', '').title()
            print(Address)
            if Address.count(',') < 2:
                splitaddress()
            else:
                final.append('ADDRESS--' + Address)
            # star_location = location[0].replace(":LOC", "").replace('"', '').replace('[', '')
            # end_location = location[-1].replace(":LOC", "").replace('"', '').replace(']', '')
            # d1 = star_location.split()
            # d2 = end_location.split()
            # d3 = d1[0]
            # d4 = d2[0]
            # start = d3
            # end = d4
            # s = horizontaltext
            # middle_address = ((s.split(start))[1].split(end)[0])
            # Address = d3 + middle_address + d4
            # final.append('ADDRESS--' + Address)
            # addrespinlst.append(Address)
        except IndexError:
            splitaddress()
        ########################################## Designation ###########################################
        import re
        new = []
        with open('test.txt', 'r') as f:
            flag = False
            for line in f:
                line1 = line
                line = line.upper()
                matches = re.findall(
                    r'''\bAPPRENTICE\b|\bEXECUTIVE\b|\bPROPRIETOR\b|\bPARTNER\b|\bMD\b|\bANALYST\b|\bPRACTITIONER\b|\bCUSTOMER\b|\bCOO\b|\bCOACH\b|\bADMINISTRATIVE\b|\bADMINISTRATOR\b|\bAGENT\b|\bHEAD\b|\bCHIEF\b|\bDIRECTOR\b|\bVICE\b|\bPRESIDENT\b|\bMANAGER\b|\bCOORDINATOR\b|\bCOUNSELOR\b|\bSUPERVISOR\b|\bASSISTANT\b|\bSPECIALIST\b|\bARTIST\b|\bWORKER\b|\bCONSULTANT\b|\bREPRESENTATIVE\b|\bARCHITECT\b|\bSTAFF\b|\bMEMBER\b|\bDEVELOPER\b|\bENGINEER\b|\bEXAMINOR\b|\bDOCTOR\b|\bPROFESSOR\b|\bTEACHER\b|\bLEAD\b|\bOFFICER\b|\bCEO\b|\bC.E.O\b|\bJUNIOR\b|\bSENIOR\b|\bPROFESSOR\b|\bSALES\b''',
                    line)
                for match in matches:
                    line = line.replace('-', '')
                    # print(line)
                    o = "Designation--" + line
                    new.append(o)
                    remove_list.append(str(line1).replace('\n', ''))
        try:
            a = new[0].replace('\n', '')
            final.append(a)
        except IndexError:
            final.append("Designation--")
        ################################################### Phone number #################################################
        num = []
        import phonenumbers
        # print(verticaltext)
        numbers = phonenumbers.PhoneNumberMatcher(
            verticaltext.replace('+91', '').replace('(0)', '').replace('(', '').replace(')', '').replace('-', '').replace(' ', ''), "IN")
        for number in numbers:
            number = str(number).split(")")
            num.append(number[1])
            # num.append(number[-1])
        print(num)
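        # str() of a PhoneNumberMatch looks roughly like
        #     'PhoneNumberMatch [12,22) 9876543210'   (the number here is made up),
        # so splitting on ')' and taking index 1 keeps just the raw matched digits.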
        import re
        # Input list of strings, e.g. num = [' 7227906777Extn1204634444']
        # Define a regular expression pattern to split when text is present
        pattern = r'[a-zA-Z]+'

        # Function to split a string based on the pattern
        def split_string(text):
            return re.split(pattern, text)

        # Process each line in the list
        split_lines = [split_string(line) for line in num]
        # Flatten the list of lists into a single list
        split_lines = [item for sublist in split_lines for item in sublist]
        # Remove any empty strings
        num = [item for item in split_lines if item]
        # Print the split lines
        print(num)
        if len(num) == 0:
            final.append("ContactNumber--")
            final.append("OrganizationNumber--")
        elif len(num) > 1:
            final.append("ContactNumber--" + num[0].replace(' ', ''))
            final.append("OrganizationNumber--" + num[-1].replace(' ', ''))
        elif len(num) == 1:
            try:
                final.append("ContactNumber--" + num[0].replace(' ', ''))
                final.append("OrganizationNumber--")
            except IndexError:
                final.append("ContactNumber--")
                final.append("OrganizationNumber--")
        print('############ num ############')
        print(num)
        # try:
        #     final.append("PhoneNumber--" + num[0].replace(' ', ''))
        #     remove_list.append(num[0])
        # except IndexError:
        #     pass
        # try:
        #     final.append("PhoneNumber1--" + num[1].replace(' ', ''))
        #     remove_list.append(num[1])
        # except IndexError:
        #     pass
        # try:
        #     final.append("PhoneNumber2--" + num[2].replace(' ', ''))
        #     remove_list.append(num[2])
        # except IndexError:
        #     pass
        ############### PINCODE ############
        pinlst = []
        print(addrespinlst)
        import pgeocode
        # try:
        #     matche1 = re.findall(r'-\d{6}|\b\d{6}\b|\b\d{3} \d{3}\b|-\d{3} \d{3}|\b\d{3} \d{3}\b', addrespinlst[0])
        #     for i in matche1:
        #         address3 = i.replace(' ', '').replace('-', '')
        #         pinlst.append(address3)
        # except IndexError:
        lst = []
        for i in num:
            i = i[1:]
            lst.append(i)
        infile = r"vtext.txt"
        outfile = r"cleaned_file.txt"
        import glob
        delete_list = lst
        # delete_list = ["firstname1 lastname1", "firstname2 lastname2", "firstnamen lastnamen", 'Director - Sales & Business Development']
        # Strip the detected phone numbers out of the text before pin-code matching.
        fin = open(infile, "r+")
        fout = open(outfile, "w+")
        for line12 in fin:
            for word in delete_list:
                line12 = line12.replace(word, "")
            fout.write(line12)
        fin.close()
        fout.close()
        # print(addrespinlst)
        import re
        # Note: line12 still holds the last line processed above; the pin-code
        # search below runs on that line only.
        matche1 = re.findall(r'-\d{6}\b|\b\d{6}\b|\b\d{3} \d{3}\b|-\d{3} \d{3}|\b\d{3} \d{3}\b', line12)
        for i in matche1:
            address3 = i.replace(' ', '').replace('-', '')
            pinlst.append(address3)
        nomi = pgeocode.Nominatim('IN')
        try:
            a = nomi.query_postal_code(str(pinlst[-1]))
            # print(a)
            b = a.keys()
            c = b.values.tolist()
            d = a.tolist()
            postal_code = "PinCode1" + "--" + d[0]
            final.append(postal_code)
            country_code = c[1] + "--" + str(d[1])
            final.append(country_code)
            place_name = 'LandMark1' + "--" + str(d[2])
            final.append(place_name)
            state_name = c[3] + "--" + str(d[3])
            final.append(state_name)
            state_code = c[4] + "--" + str(d[4])
            final.append(state_code)
            county_name = 'CityName1' + "--" + str(d[5])
            final.append(county_name)
        except (IndexError, NameError):
            final.append("PinCode1--" + " ")
            final.append("country_code--")
            final.append("LandMark1--")
            final.append("state_name--")
            final.append("state_code--")
            final.append("CityName1--")
        ######################################################## json #####################################################################
        import pandas as pd
        df = pd.DataFrame(final)
        df1 = df[0].str.split('--', expand=True)
        # print(df1)
        df1.rename({df1.columns[-2]: 'Keys'}, axis=1, inplace=True)
        df1.rename({df1.columns[-1]: 'Values'}, axis=1, inplace=True)
        df1['Keys'] = df1['Keys'].str.strip()
        df1.to_csv('path123.csv', index=False)
        df2 = pd.read_csv('path123.csv')
        print(df2)
        if df2['Values'].isnull().all():
            print("Column 'Values' is empty.")
            return 'Invalid image'
        else:
            pass
        df2 = df2.T
        df2.to_csv('path1.csv', index=False, header=False)
        df1 = pd.read_csv('path1.csv')
        df1.to_json('firstjson1.json', orient="index")
        import json
        with open('firstjson1.json', 'r') as json_file:
            json_load = json.load(json_file)
        # url = "https://test.bizgaze.app:8443/apis/v4/bizgaze/integrations/businesscards/create"
        nothing = json.dumps(json_load).replace("]", "").replace("[", "").replace('{"0":', '').replace('}}', '}')
        # print('--------------------------------------------------------------------------')
        # print(nothing)
        empty = []
        import base64
        name = found
        image = open(name, 'rb')
        image_read = image.read()
        image_64_encode = base64.b64encode(image_read)
        NULL = 'null'
        empty.append("ByteData--" + NULL.strip('""'))
        image_64_encode = image_64_encode.decode('utf-8')
        empty.append("FileData--" + str(image_64_encode))
        imagedata = name.split("/")
        imagename = str(imagedata[-1]).replace('"', '').replace("[", "").replace("]", "")
        imagename1 = str(imagename).split('.')
        imagename = str(imagename1[-2]).replace("[", "]")
        empty.append("FileName--" + imagename)
        empty.append("FilePath--" + "")
        imageExtension = str(imagename1[-1]).replace("[", "]")
        empty.append("FileType--" + imageExtension)
        image.close()
        import pandas as pd
        df = pd.DataFrame(empty)
        df = df[0].str.split("--", expand=True)
        data1 = pd.DataFrame(df[0])
        data2 = pd.DataFrame(df[1])
        dt = data2.set_index(data1[0])
        dt4 = dt.T
        dictionary = dt4.to_dict(orient="index")
        list1 = []
        # list.append(a)
        list1.append(dictionary[1])
        # final.append("image--" + str(dictionary[1]).replace("\'", '"'))
        print('--------------------')
        # print(namelist)
        import json
        # JSON data:
        x = nothing
        # python object to be appended
        y = {"image": dictionary[1]}
        # parsing JSON string:
        z = json.loads(x)
        # appending the data
        z.update(y)
        # the result is a JSON string:
        # print(json.dumps(z))
        zlist.append(z)
    ############################################# creating csv #####################################
    # print(final)
    # print(imagelist)
    # final.append('image--' + str(imagelist))
    # import requests
    # import json
    # # url = "https://anwi.bizgaze.app/apis/v4/bizgaze/integrations/businesscards/create/list"  # dev
    # url = "https://qa.bizgaze.com/apis/v4/bizgaze/integrations/businesscards/create/list"  # testing
    # # url = "https://test.bizgaze.app/apis/v4/bizgaze/integrations/businesscards/create"  # test
    # # url = 'http://localhost:3088/apis/v4/bizgaze/integrations/businesscards/create'
    # # url = 'https://c01.bizgaze.app/apis/v4/bizgaze/integrations/businesscards/create'  # C01
    # payload1 = json.dumps(zlist)
    # # print('--------------------------------------------------------------------------')
    # # print(payload1)
    # headers = {
    #     # 'Authorization': 'stat 1a936137490040c997928f485e3cdd7a',  # dev
    #     # 'Authorization': 'stat 16516391d0074f4c8a15ea16fb49470b',  # testing
    #     # 'Authorization': 'stat 08e55fcfbaa940c8ab8145a074c444d1',
    #     # 'Authorization': 'stat f7cdb402e01e44e5842878653946168f',  # c01
    #     # 'Authorization': 'Stat c3e11b2fcbfe455b86a1fe6efde02a69',  # demo
    #     'Authorization': 'Stat e5bc6ad08f2c42feb5f98a2a521d00af',
    #     'Content-Type': 'application/json'
    # }
    # response = requests.request("POST", url, headers=headers, data=payload1)
    # print(payload1)
    # # print(zlist)
    # # if 'BusinessCards Created Successfully' in response.text:
    # #     print('present')
    # #     os.remove(found)
    # # else:
    # #     print('not present')
    # df1.to_json('visitingcard.json')
    # data = df1.to_json('visiting.json', orient='records')
    # print(data)
    # return render_template('index.html')
    # return response.text
    return z
    # return zlist
# @app.route('/upload_BusinessCards', methods=["POST"])
# def mainfunction():
#     Dataset = request.get_json()
#     if len(Dataset) == 1:
#         # predict(Dataset)
#         return multiplecards(Dataset)
#     else:
#         # multiplecards(Dataset)
#         return multiplecards(Dataset)
################################################################################### Resume parser ###################################################################################################
@app.route("/upload_resume", methods=["POST"])
def predict_resume():
    Dataset = request.get_json()
    # data = {'visiting': Dataset}
    # a = url_list[0]
    a = Dataset
    # a = url_list
    # print(a)
    x = a['FileData']
    # print(x)
    y = a['FileName']
    y = y.replace(' ', '')
    y = y.replace('&', '')
    y = y.replace('@', '')
    z = a['FileType']
    # CreatedBy = a['CreatedBy']
    name = y + '.' + z
    print(name)
    # img_data = x.encode()
    img_data = x.encode()
    import base64
    with open('./Resume_parser/upload_resume/' + name, "wb") as fh:
        fh.write(base64.decodebytes(img_data))
    # cmd = "python ./Resume_parser/resume1.0.multiprocessing.py" + " " + str('./Resume_parser/upload_resume/' + name)
    # os.system(cmd)
    # f = "./resume_upload"
    # f = os.listdir(f)
    f = './Resume_parser/upload_resume/' + name
    found = './Resume_parser/upload_resume/' + name
    print('this from resumepy file')
    print(f)
    def docx_to_txt():
        import docx2txt
        import glob
        text = ''
        for file in glob.glob(found):
            c = docx2txt.process(file)
            c = c.rstrip("\n")
            toPrint = c
            d = ' '.join(i for i in toPrint.split())
            d = d.rstrip()
            text += d
        docx_to_txt.text = text

    def doc_to_txt():
        import docx2txt
        import glob
        text = ''
        # for file in glob.glob(found):
        c = docx2txt.process(f)
        c = c.rstrip("\n")
        toPrint = c
        d = ' '.join(i for i in toPrint.split())
        d = d.rstrip()
        text += d
        doc_to_txt.text = text

    def pdf_to_txt():
        import sys
        import fitz  # PyMuPDF
        fname = found
        doc = fitz.open(fname)
        text = ""
        for page in doc:
            text = text + str(page.get_text())
        pdf_to_txt.text = " ".join(text.split('\n'))
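    # These helpers stash their result as an attribute on the function object
    # (e.g. pdf_to_txt.text) rather than returning it; the dispatch below reads
    # that attribute after calling the matching converter.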
    # for file in f:
    print('checking for filetype')
    if f.endswith('.doc'):
        doc_to_txt()
        x = doc_to_txt.text
    elif f.endswith('.docx'):
        docx_to_txt()
        x = docx_to_txt.text
    elif f.endswith('.pdf'):
        pdf_to_txt()
        x = pdf_to_txt.text
    # NOTE: nlp_model / nlp_model1 are the spaCy models whose load lines are
    # commented out near the top of this file; they must be loaded for this route to run.
    doc = nlp_model(x)
    k = []
    l = []
    for ent in doc.ents:
        # print(f'{ent.label_.upper():{30}}- {ent.text}')
        k.append(ent.label_.upper())
        l.append(ent.text)
    columns = k
    rows = [l]
    import pandas as pd
    data = pd.DataFrame(rows, columns=columns)
    df = data
    data = df.T
    data.to_csv('./Resume_parser/Ad1.csv', index=True)
    data = pd.read_csv('./Resume_parser/Ad1.csv')
    # print(data)
    data.rename({data.columns[-2]: 'Key'}, axis=1, inplace=True)
    data.rename({data.columns[-1]: 'Values'}, axis=1, inplace=True)
    data.to_csv('./Resume_parser/Ad1.csv', index=False)
    #####################################################################################################
    # ModelName = "text-davinci-003"
    # prompt_value = 'find designation in key value pairs from below text?' + "\n" + str(x)
    # max_token_value = 300
    # # usertext = request.get_data()
    # # output = usertext.decode()
    # # print(output)
    # import os
    # import openai
    # openai.api_key = "sk-..."  # key redacted
    # # userinput = 'fibonacci series in python'
    # # openai.api_key = os.getenv("OPENAI_API_KEY")
    # response_text = openai.Completion.create(
    #     model=ModelName,
    #     prompt=prompt_value,
    #     temperature=0,
    #     max_tokens=max_token_value,
    #     top_p=1,
    #     frequency_penalty=0,
    #     presence_penalty=0,
    #     stop=["\"\"\""]
    # )
    # a = response_text['choices']
    # data = a[0]['text']
    # data = data.replace('\n', '$@$')
    # data = data.replace('$@$$@$', '')
    # # data = data.replace(':', '')
    # print(data)
    # data = data.replace('Designation', 'POSITION')
    # data = data.split('$@$')
    # print(data)
    # import pandas as pd
    # desgnaition = pd.DataFrame(data)
    # desgnaition = desgnaition[0].str.split(':', expand=True)
    # desgnaition.columns = ['Key', 'Values']
    # print(desgnaition)
    # data = pd.read_csv('./Resume_parser/Ad1.csv')
    # frames = [data, desgnaition]
    # result = pd.concat(frames, axis=0)
    # result.to_csv('./Resume_parser/Ad1.csv', index=False)
    ########################################################################################################
    # df2 = pd.read_csv('./Ad1.csv')
    x1 = pd.read_csv('D:/projects/C01app/Resume_parser/AD11.csv')
    tp = pd.read_csv('./Resume_parser/Ad1.csv')
    # tp = tp.loc[:, ~tp.columns.str.contains('^Unnamed')]
    merge = pd.merge(tp, x1, on='Key', how='right')
    merge.to_csv('./Resume_parser/AD.csv', index=False)
    df2 = pd.read_csv('./Resume_parser/AD.csv')
    # print(df2)
    df2 = df2.T
    df2.to_csv('./Resume_parser/path.csv', index=False, header=False)
    df1 = pd.read_csv('./Resume_parser/path.csv')
    df1.to_json('./Resume_parser/firstjson.json', orient="index")
    print(df1)
    doc = nlp_model1(x)
    k = []
    l = []
    for ent in doc.ents:
        # print(f'{ent.label_.upper():{30}}- {ent.text}')
        k.append(ent.label_.upper())
        l.append(ent.text)
    columns = k
    rows = [l]
    data = pd.DataFrame(rows, columns=columns)
    df = data
    data = df.T
    data.to_csv('./Resume_parser/Ad2.csv', index=True)
    data = pd.read_csv('./Resume_parser/Ad2.csv')
    data.rename({data.columns[-2]: 'Key'}, axis=1, inplace=True)
    data.rename({data.columns[-1]: 'Values'}, axis=1, inplace=True)
    data.to_csv('./Resume_parser/Ad2.csv', index=False)
    import pandas as pd
    import numpy as np
    import json
    dflist = []
    # Post-graduation details
    x = pd.read_csv('D:/projects/C01app/Resume_parser/PG.csv')
    tp = pd.read_csv('./Resume_parser/Ad2.csv')
    # tp = tp.loc[:, ~tp.columns.str.contains('^Unnamed')]
    merge = pd.merge(x, tp, on='Key', how='left')
    merge = merge.replace(np.nan, '', regex=True)
    merge.to_csv('./Resume_parser/PGmerge.csv', index=False)
    dfPG = pd.read_csv('./Resume_parser/PGmerge.csv')
    dfPG = dfPG.replace({np.nan: None})
    x2 = dfPG.iloc[:, -2].tolist()
    y2 = dfPG.iloc[:, -1].tolist()
    z1 = dict(zip(x2, y2))
    dflist.append(z1)
    # u1 = json.dumps(z1)
    # Under-graduation details
    x = pd.read_csv('D:/projects/C01app/Resume_parser/UG.csv')
    tp = pd.read_csv('./Resume_parser/Ad2.csv')
    # tp = tp.loc[:, ~tp.columns.str.contains('^Unnamed')]
    merge = pd.merge(x, tp, on='Key', how='left')
    merge = merge.replace(np.nan, '', regex=True)
    merge.to_csv('./Resume_parser/UGmerge.csv', index=False)
    dfUG = pd.read_csv('./Resume_parser/UGmerge.csv')
    dfUG = dfUG.replace({np.nan: None})
    x2 = dfUG.iloc[:, -2].tolist()
    y2 = dfUG.iloc[:, -1].tolist()
    z2 = dict(zip(x2, y2))
    dflist.append(z2)
    # u2 = json.dumps(z2)
    # final = '[' + str(z1) + ',' + str(z2) + ']'
    # return render_template('resume.html')
    ############################################################################
    # Intermediate details
    x = pd.read_csv('D:/projects/C01app/Resume_parser/inter.csv')
    tp = pd.read_csv('./Resume_parser/Ad2.csv')
    # tp = tp.loc[:, ~tp.columns.str.contains('^Unnamed')]
    merge = pd.merge(x, tp, on='Key', how='left')
    merge = merge.replace(np.nan, '', regex=True)
    merge.to_csv('./Resume_parser/intermerge.csv', index=False)
    dfinter = pd.read_csv('./Resume_parser/intermerge.csv')
    dfinter = dfinter.replace({np.nan: None})
    x2 = dfinter.iloc[:, -2].tolist()
    y2 = dfinter.iloc[:, -1].tolist()
    z3 = dict(zip(x2, y2))
    dflist.append(z3)
    ############################################################################
    # SSC details
    x = pd.read_csv('D:/projects/C01app/Resume_parser/SSC.csv')
    tp = pd.read_csv('./Resume_parser/Ad2.csv')
    # tp = tp.loc[:, ~tp.columns.str.contains('^Unnamed')]
    merge = pd.merge(x, tp, on='Key', how='left')
    merge = merge.replace(np.nan, '', regex=True)
    merge.to_csv('./Resume_parser/sscmerge.csv', index=False)
    dfssc = pd.read_csv('./Resume_parser/sscmerge.csv')
    dfssc = dfssc.replace({np.nan: None})
    x2 = dfssc.iloc[:, -2].tolist()
    y2 = dfssc.iloc[:, -1].tolist()
    z4 = dict(zip(x2, y2))
    dflist.append(z4)
  1316. ############################################Document############################################################
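    # Attach the uploaded file itself: base64-encode the bytes and record
    # FileName/FileType alongside a blank placeholder record.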
    import base64
    empty = []
    name = f
    image = open(name, 'rb')
    image_read = image.read()
    image_64_encode = base64.b64encode(image_read)
    NULL = 'null'
    # empty.append("ByteData--" + (NULL).strip('""'))
    image_64_encode = image_64_encode.decode('utf-8')
    empty.append("FileData--" + str(image_64_encode))
    imagedata = name.split("/")
    imagename = str(imagedata[-1]).replace('"', '').replace("[", "").replace("]", "")
    imagename1 = str(imagename).split('.')
    imagename = str(imagename1[-2]).replace("[", "]")
    empty.append("FileName--" + imagename)
    empty.append("FilePath--" + "")
    imageExtension = str(imagename1[-1]).replace("[", "]")
    empty.append("FileType--" + imageExtension)
    import pandas as pd
    df = pd.DataFrame(empty)
    df = df[0].str.split("--", expand=True)
    data1 = pd.DataFrame(df[0])
    data2 = pd.DataFrame(df[1])
    dt = data2.set_index(data1[0])
    dt4 = dt.T
    dictionary = dt4.to_dict(orient="index")
    a = {
        "FileId": 0,
        "FileData": "",
        "FileName": "",
        "FileType": "",
        "RefId": 0
    }
    file_list = []  # renamed from `list` to avoid shadowing the built-in
    file_list.append(a)
    file_list.append(dictionary[1])
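    # Build the final payload from firstjson.json. to_json(orient="index")
    # wraps the row as {"0": {...}}, so the replace() chain below strips that
    # wrapper before the extra sections are merged in. Illustrative shape
    # (field names depend on the trained model, not verified here):
    # {"Name": "...", ..., "EducationDetails": [...], "Document": [...]}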
    import json
    with open('./Resume_parser/firstjson.json', 'r') as json_file:
        json_load = json.load(json_file)
    # url = "https://test.bizgaze.app:8443/apis/v4/bizgaze/integrations/businesscards/create"
    nothing = json.dumps(json_load).replace("]", "").replace("[", "").replace('{"0":', '').replace('}}', '}')
    # JSON data:
    x = nothing
    # python objects to be appended
    y = {"EducationDetails": dflist}
    y1 = {"Document": file_list}
    print(y)
    # parsing JSON string:
    z = json.loads(x)
    # appending the data
    z.update(y)
    z.update(y1)
    # the result is a JSON string:
    # print(json.dumps(z))
    print('##########################')
    # print(z)
    print('##########################')
    import requests
    import json
    # with open('visitingcard1.json', 'r') as json_file:
    #     json_load = json.load(json_file)
    # url = "https://qa.bizgaze.app/apis/v4/bizgaze/integrations/resumeparsing/save"  # dev
    # # url = "https://test.bizgaze.app:8443/apis/v4/bizgaze/integrations/resumeparsing/save"
    # # url = "https://qa.bizgaze.app/apis/v4/bizgaze/integrations/resumeparsing/save"  # testing
    # payload1 = json.dumps(z)
    # print('--------------------------------------------------------------------------')
    # # print(payload1)
    # headers = {
    #     'Authorization': 'stat <redacted>',  # tokens redacted; load from config
    #     'Content-Type': 'application/json'
    # }
    # response = requests.request("POST", url, headers=headers, data=payload1)
    # print("##############################################################")
    # print(response.text)
    # function_1.var = response
    # a = str(response.text)
    files = glob.glob('./resume_upload/*')
    for f in files:
        os.remove(f)
    return z
    # return 'done'
    # return render_template('resume.html')

# @app.route('/upload_resume', methods=["POST"])
def upload_resume():
    if __name__ == "__main__":
        # print(os.getpid())
        url_list = []
        Dataset = request.get_json()
        # id = "100013660000125"
        url_list.append(Dataset)
        # multiprocessing: run the parse in a single worker process
        with multiprocessing.Pool(processes=1) as pool:
            results = pool.map(predict_resume, url_list)
            pool.close()
        return results[0]
  1416. @app.route("/Download_resume")
  1417. def Download_resume():
  1418. # try:
  1419. with open("Ad1.csv", encoding="unicode_escape") as fp:
  1420. csv = fp.read()
  1421. return Response(csv, mimetype="text/csv", headers={"Content-disposition": "attachment; filename=Resume.csv"})

############################################################################## Invoice Parser ###################################################################################################
@app.route('/upload_invoice', methods=["POST", "GET"])
def upload_invoice():
    Dataset = request.get_json()
    # data = {'visiting': Dataset}
    # a = url_list[0]
    a = Dataset
    x = a['FileData']
    # print(x)
    y = a['FileName']
    z = a['FileType']
    # CreatedBy = a['CreatedBy']
    name = y + '.' + z
    print(name)
    img_data = x.encode()
    import base64
    with open('./Invoice_parser/upload_invoice/' + name, "wb") as fh:
        fh.write(base64.decodebytes(img_data))
    # cmd = "python ./Invoice_parser/invoice.multiprocessing.py" + " " + str('./Invoice_parser/upload_invoice/' + name)
    # os.system(cmd)
    #####################################################################################################################################
    name = './Invoice_parser/upload_invoice/' + name
    extension = name.split('.')[-1]
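    # OCR strategy: images are first converted to a searchable PDF with
    # Tesseract (image_to_pdf_or_hocr), then the text layer is read back with
    # PyMuPDF (fitz); native PDFs go straight to text extraction below.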
    def image_to_text():
        print('####################### image-to-pdf ################')
        import cv2
        import numpy as np
        fname = name
        print(fname)
        import pytesseract as tess
        from PIL import Image
        tess.pytesseract.tesseract_cmd = r"C:\Program Files (x86)\Tesseract-OCR\tesseract.exe"
        img = cv2.imread(fname)
        # img = cv2.resize(img, None, fx=1.2, fy=1.2, interpolation=cv2.INTER_CUBIC)
        # img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        # kernel = np.ones((1, 1), np.uint8)
        # img = cv2.dilate(img, kernel, iterations=1)
        # img = cv2.erode(img, kernel, iterations=1)
        # img = cv2.threshold(cv2.GaussianBlur(img, (5, 5), 0), 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]
        pdf = tess.image_to_pdf_or_hocr(img, extension="pdf")
        with open(Current_Working_Directory + "/Invoice_parser/demo.pdf", "w+b") as f:
            f.write(pdf)
        print('demo created')
        import fitz
        fname = Current_Working_Directory + '/Invoice_parser/demo.pdf'
        doc = fitz.open(fname)
        text = ""
        for page in doc:
            text = text + str(page.get_text())
        image_to_text.text = " ".join(text.split("\n"))
        # result = ocr.ocr(Current_Working_Directory + "/Invoice_parser/demo.pdf", cls=True)
        # result = result[0]
        # txts = [line[1][0] for line in result]
        # image_to_text.text = ""
        # for i in txts:
        #     if len(i) < 4:
        #         continue
        #     # print(i + "\n")
        #     image_to_text.text = image_to_text.text + str(i) + "\n"
    def pdf_to_text():
        import fitz
        fname = name
        doc = fitz.open(fname)
        text = ""
        for page in doc:
            text = text + str(page.get_text())
        pdf_to_text.text = " ".join(text.split("\n"))
    extensionlist = ['JPEG', 'jpg', 'png', 'JPG', 'PNG', 'jpeg']
    if extension in extensionlist:
        print('image' + extension)
        image_to_text()
        x = image_to_text.text
    else:
        print('pdf' + extension)
        pdf_to_text()
        x = pdf_to_text.text
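    # Run the NER model over the extracted invoice text and pivot the entities
    # into a Key/Values frame, mirroring the resume flow.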
    import spacy
    import sys
    # import fitz
    # fname = "uploads/0.pdf"
    # doc = fitz.open(fname)
    # text = ""
    # for page in doc:
    #     text = text + str(page.get_text())
    # fitz = " ".join(text.split("\n"))
    # # print(fitz)
    import pandas as pd
    doc = nlp_model1(x)
    k = []
    l = []
    for ent in doc.ents:
        # print(f"{ent.label_.upper():{30}}- {ent.text}")
        k.append(ent.label_.upper())
        l.append(ent.text)
    columns = k
    rows = [l]
    data = pd.DataFrame(rows, columns=columns)
    df = data
    df = data.T
    df.to_csv(Current_Working_Directory + "/Invoice_parser/Invoice.csv")
    import pandas as pd
    df = pd.read_csv(Current_Working_Directory + "/Invoice_parser/Invoice.csv")
    # df.head()
    # df = df.T
    # new_header = df.iloc[0]  # grab the first row for the header
    # df = df[1:]  # take the data less the header row
    # df.columns = new_header
    # def df_column_uniquify(df):
    #     df_columns = df.columns
    #     new_columns = []
    #     for item in df_columns:
    #         counter = 0
    #         newitem = item
    #         while newitem in new_columns:
    #             counter += 1
    #             newitem = "{}_{}".format(item, counter)
    #         new_columns.append(newitem)
    #     df.columns = new_columns
    #     return df.T
    # df = df_column_uniquify(df)
    # # df = df.T
    # df.to_csv('final.csv')
    # df = pd.read_csv('final.csv')
    df.rename({df.columns[-2]: 'Key'}, axis=1, inplace=True)
    df.rename({df.columns[-1]: 'Values'}, axis=1, inplace=True)
    df['Key'] = df['Key'].str.replace('/', '')
    df['Key'] = df['Key'].str.replace(' ', '')
    df.to_csv(Current_Working_Directory + '/Invoice_parser/final.csv', index=False)
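    # Split keys whose values embed a colon (finalwithcolen.csv) on ':' before
    # concatenating them with the colon-free keys (finalwithoutcolen.csv) and
    # right-joining against the master key list (main.csv).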
    import pandas as pd
    x1 = pd.read_csv(Current_Working_Directory + '/Invoice_parser/final.csv')
    tp = pd.read_csv(Current_Working_Directory + '/Invoice_parser/finalwithcolen.csv')
    merge = pd.merge(x1, tp, on='Key', how='right')
    merge1 = merge
    merge['Values'] = merge['Values'].astype(str)
    merge = merge['Values'].str.split(":", expand=True)
    merge.rename({merge.columns[-1]: 'Values'}, axis=1, inplace=True)
    frames = [merge1['Key'], merge['Values']]
    result = pd.concat(frames, axis=1)
    x1 = pd.read_csv(Current_Working_Directory + '/Invoice_parser/final.csv')
    tp = pd.read_csv(Current_Working_Directory + '/Invoice_parser/finalwithoutcolen.csv')
    merged = pd.merge(x1, tp, on='Key', how='right')
    frames = [result, merged]
    result1 = pd.concat(frames)
    result1.to_csv(Current_Working_Directory + '/Invoice_parser/final1.csv', index=False)
    x1 = pd.read_csv(Current_Working_Directory + '/Invoice_parser/main.csv')
    tp = pd.read_csv(Current_Working_Directory + '/Invoice_parser/final1.csv')
    # tp = pd.read_csv(Current_Working_Directory + '/Invoice_parser/final.csv')
    tp['Key'] = tp['Key'].astype(str)
    tp['Values'] = tp['Values'].astype(str)
    tp['Key'] = tp['Key'].str.strip()
    tp['Values'] = tp['Values'].str.strip()
    merge = pd.merge(tp, x1, on='Key', how='right')
    merge.to_csv(Current_Working_Directory + '/Invoice_parser/invoicewithouttable.csv', index=False)
    df2 = pd.read_csv(Current_Working_Directory + '/Invoice_parser/invoicewithouttable.csv')
    # Import writer class from csv module
    from csv import writer
    List = ['PlantCode', " "]
    with open(Current_Working_Directory + '/Invoice_parser/invoicewithouttable.csv', 'a') as f_object:
        writer_object = writer(f_object)
        writer_object.writerow(List)  # the with-block closes the file automatically
    # print(df2)
    df2 = pd.read_csv(Current_Working_Directory + '/Invoice_parser/invoicewithouttable.csv')
    print(df2)
    df2 = df2.T
    df2.to_csv(Current_Working_Directory + '/Invoice_parser/invoicewithouttable.csv', index=False, header=False)
    df1 = pd.read_csv(Current_Working_Directory + '/Invoice_parser/invoicewithouttable.csv')
    df1.to_json(Current_Working_Directory + '/Invoice_parser/firstjson.json', orient="index")
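    # Line items: values sharing a key are joined with '/', re-split into one
    # column per item, and transposed into a list of dicts for "InvoiceItems".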
    import pandas as pd
    x = pd.read_csv(Current_Working_Directory + '/Invoice_parser/final.csv')
    tp = pd.read_csv(Current_Working_Directory + '/Invoice_parser/item1.csv')
    x['Values'] = x['Values'].str.strip()
    merge = pd.merge(tp, x, on='Key', how='inner')
    merge = merge.groupby('Key').agg({
        'Values': '/'.join,
    }).reset_index()
    z = merge['Values'].str.split('/', expand=True)
    frames = [merge, z]
    result1 = pd.concat(frames, axis=1)
    result1 = result1.drop(['Values'], axis=1)
    tp = pd.read_csv(Current_Working_Directory + '/Invoice_parser/item1.csv')
    merge = pd.merge(tp, result1, on='Key', how='inner')
    merge = merge.T
    new_header = merge.iloc[0]  # grab the first row for the header
    merge = merge[1:]  # take the data less the header row
    merge.columns = new_header
    merge = merge.to_dict('records')
    invoice_Item = merge
    print(invoice_Item)
    ####################################Document############################################################
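    # Same document-attachment step as the resume flow, but FilePath is
    # populated with the saved invoice path.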
    import base64
    empty = []
    # name = found
    image = open(name, 'rb')
    image_read = image.read()
    image_64_encode = base64.b64encode(image_read)
    NULL = 'null'
    # empty.append("ByteData--" + (NULL).strip('""'))
    image_64_encode = image_64_encode.decode('utf-8')
    empty.append("FileData--" + str(image_64_encode))
    imagedata = name.split("/")
    imagename = str(imagedata[-1]).replace('"', '').replace("[", "").replace("]", "")
    imagename1 = str(imagename).split('.')
    imagename = str(imagename1[-2]).replace("[", "]")
    empty.append("FileName--" + imagename)
    empty.append("FilePath--" + name)
    imageExtension = str(imagename1[-1]).replace("[", "]")
    empty.append("FileType--" + imageExtension)
    import pandas as pd
    df = pd.DataFrame(empty)
    df = df[0].str.split("--", expand=True)
    data1 = pd.DataFrame(df[0])
    data2 = pd.DataFrame(df[1])
    dt = data2.set_index(data1[0])
    dt4 = dt.T
    dictionary = dt4.to_dict(orient="index")
    a = {
        "FileId": 0,
        "FileData": "",
        "FileName": "",
        "FileType": "",
        "RefId": 0
    }
    file_list = []  # renamed from `list` to avoid shadowing the built-in
    file_list.append(a)
    file_list.append(dictionary[1])
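    # Assemble the invoice response exactly like the resume payload: strip the
    # to_json(orient="index") wrapper, then merge in InvoiceItems and Document.
    # Illustrative shape (keys depend on the trained model, not verified here):
    # {"InvoiceNumber": "...", ..., "InvoiceItems": [...], "Document": [...]}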
    import json
    with open(Current_Working_Directory + '/Invoice_parser/firstjson.json', 'r') as json_file:
        json_load = json.load(json_file)
    # url = "https://test.bizgaze.app:8443/apis/v4/bizgaze/integrations/businesscards/create"
    nothing = json.dumps(json_load).replace("]", "").replace("[", "").replace('{"0":', '').replace('}}', '}')
    # JSON data:
    x = nothing
    # python objects to be appended
    y = {"InvoiceItems": invoice_Item}
    y1 = {"Document": file_list}
    # parsing JSON string:
    z = json.loads(x)
    # appending the data
    z.update(y)
    z.update(y1)
    # the result is a JSON string:
    # print(json.dumps(z))
    # print(z)
    # import requests
    # import json
    # # with open('visitingcard1.json', 'r') as json_file:
    # #     json_load = json.load(json_file)
    # url = "https://test.bizgaze.app:8443/apis/v4/bizgaze/integrations/invoice/createsalesinvoice"
    # payload1 = json.dumps(z)
    # print('--------------------------------------------------------------------------')
    # print(payload1)
    # headers = {
    #     'Authorization': 'stat <redacted>',  # token redacted; load from config
    #     'Content-Type': 'application/json'
    # }
    # response = requests.request("POST", url, headers=headers, data=payload1)
    # print("##############################################################")
    # print(response.text)
    # import glob
    # files = glob.glob("upload_invoice/*")
    # for f in files:
    #     os.remove(f)
    # files = glob.glob("uploads/*")
    # for f in files:
    #     os.remove(f)
    return z
    # return render_template('invoice.html')
  1701. @app.route("/Download_invoice")
  1702. def Download_invoice():
  1703. pass
  1704. @app.route("/Table")
  1705. def Table():
  1706. pass
  1707. if __name__ == "__main__":
  1708. app.run(host='0.0.0.0', port=1112)