No description provided.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

Business_cards.py 45KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175
# --- Module setup: imports, ML model loading, and Flask app creation ---
# NOTE(review): several imports are duplicated further down (os, glob, requests,
# time, multiprocessing, PIL.Image, functools.partial, pandas, flask) — harmless
# at runtime but should be consolidated into one grouped import block.
from flask import Flask, render_template, request, redirect, Response, send_file
import os
# import openai
import requests
import pandas as pd
import pgeocode
from email_scraper import scrape_emails
import phonenumbers
from pdfminer.high_level import extract_text
import pytesseract
import time
import multiprocessing
from PIL import Image
from functools import partial
from urlextract import URLExtract
import pytesseract as tess
from PIL import Image
# from doctr.io import DocumentFile
# from doctr.models import ocr_predictor
# model = ocr_predictor(pretrained=True)
# load tagger
######################################################
import os
import glob
from pytesseract import *
import shutil
import cv2
import matplotlib
from werkzeug.utils import secure_filename
import requests
import spacy
import time
import multiprocessing
from PIL import Image
from functools import partial
# spaCy NER models for the resume parser.
# NOTE(review): absolute Windows paths — presumably deployment-specific; should
# be made configurable (env var / config file). TODO confirm.
nlp_model = spacy.load("D:/projects/C01app/Resume_parser/ME")
nlp_model1 = spacy.load("D:/projects/C01app/Resume_parser/bdeeducation_50_0.2")
from flask import Flask, render_template, request, redirect, Response, send_file
import pandas as pd
################################################################
# Normalize the CWD to forward slashes so it can be concatenated into model paths.
Current_Working_Directory=os.getcwd()
Current_Working_Directory=Current_Working_Directory.replace("\\","/")
# NOTE(review): this REBINDS nlp_model1 — the Resume_parser education model
# loaded a few lines above is discarded and replaced by the invoice-parser
# model. Looks like an accidental name collision; confirm which model any
# resume-parsing code later in the file actually expects.
nlp_model1 = spacy.load(Current_Working_Directory + "/Invoice_parser/p")
################################################################
# import spacy
# nlp_model1 = spacy.load('./ADD3001.2')
from flair.data import Sentence
from flair.models import SequenceTagger
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
# HuggingFace token-classification model used for city/country NER (address extraction).
tokenizer = AutoTokenizer.from_pretrained("ml6team/bert-base-uncased-city-country-ner")
model = AutoModelForTokenClassification.from_pretrained("ml6team/bert-base-uncased-city-country-ner")
from paddleocr import PaddleOCR, draw_ocr
# PaddleOCR instance used to OCR business-card images (English, angle classification on).
ocr = PaddleOCR(use_angle_cls=True, lang='en', use_space_char=True, show_log=False)
# Flair large English NER tagger — provides PER/ORG/LOC tags for card text.
# NOTE(review): loading this at import time is slow and memory-heavy; consider lazy init.
tagger = SequenceTagger.load("flair/ner-english-large")
import datetime
app = Flask(__name__)
# app.config["IMAGE_UPLOADS"] = "C:/inetpub/wwwroot/FlaskApplication/Flask_Demo/upload/"
  57. # app.config["IMAGE_UPLOADS"] = "C:/inetpub/wwwroot/FlaskApplication/Flask_Demo/upload/"
  58. @app.route('/', methods=['GET'])
  59. def home():
  60. return render_template('home.html')
  61. @app.route('/resume', methods=['GET'])
  62. def resume():
  63. return render_template('resume.html')
  64. @app.route('/invoice', methods=['GET'])
  65. def invoice():
  66. return render_template('invoice.html')
  67. @app.route('/card', methods=['GET'])
  68. def card():
  69. return render_template('card.html')
  70. @app.route('/upload_BusinessCards', methods=["POST"])
  71. # @app.route('/multiplecards', methods=["POST"])
  72. def multiplecards():
  73. # print('################## multiple card detection #######################')
  74. # print(Dataset)
  75. datalist=[]
  76. zlist=[]
  77. Dataset = request.get_json()
  78. # print(data)
  79. #datalist.append(Dataset)
  80. data = {'visiting': Dataset}
  81. for i in data['visiting']:
  82. import time
  83. # time.sleep(1)
  84. a = i
  85. x = a['FileData']
  86. # print(x)
  87. y = a['FileName']
  88. z = a['FileType']
  89. # CreatedBy=a['CreatedBy']
  90. name = y + '.' + z
  91. # print(name)
  92. # print(y)
  93. # image = y.split("/")
  94. # filename=image[-1]
  95. # print(x)
  96. img_data = x.encode()
  97. import base64
  98. with open('./multicards/' + name, "wb") as fh:
  99. fh.write(base64.decodebytes(img_data))
  100. # print(i)
  101. # import os
  102. # import glob
  103. # for i in glob.glob('./multipleupload/*'):
  104. found = './multicards/' + name
  105. print(found)
  106. extension = found.split('.')[-1]
  107. # for root, dirs, fils in os.glob('./multipleupload'):
  108. # for name in files:
  109. # foundfile= os.path.join(root, name)
  110. # print(foundfile)
  111. import re
  112. import csv
  113. import glob
  114. import os
  115. # import pytesseract
  116. # import cv2
  117. import numpy as np
  118. import glob
  119. import os
  120. import cv2
  121. import requests
  122. final = []
  123. # final.append('assignto--'+CreatedBy)
  124. imagelist = []
  125. # print(found)
  126. remove_list = []
  127. import os
  128. import glob
  129. import pdfminer
  130. # import os
  131. # ts = 0
  132. # for file_name in glob.glob('./upload/*'):
  133. # fts = os.path.getmtime(file_name)
  134. # if fts > ts:
  135. # ts = fts
  136. # found = file_name
  137. # print(found)
  138. # print(extension)
  139. def org_name():
  140. print('org_name is working')
  141. import pytesseract
  142. fname = found
  143. if extension != 'pdf':
  144. img = cv2.imread(fname)
  145. img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
  146. cv2.imwrite(str(found), img)
  147. from PIL import Image
  148. im = Image.open(found)
  149. im.save("images1.png", dpi=(1200, 1200))
  150. # import pytesseract
  151. fname = "images1.png"
  152. import pytesseract as tess
  153. from PIL import Image
  154. tess.pytesseract.tesseract_cmd = r"C:\\Program Files (x86)\\Tesseract-OCR\\tesseract.exe"
  155. pdf = tess.image_to_pdf_or_hocr(fname, extension="pdf")
  156. with open("demo.pdf", "w+b", ) as f:
  157. f.write(pdf)
  158. from pdfminer.high_level import extract_text
  159. text = extract_text('demo.pdf')
  160. # doc = DocumentFile.from_images(found)
  161. # result = model(doc)
  162. # text = result.render()
  163. # from pdfminer.high_level import extract_text
  164. # txt = extract_text('demo.pdf')
  165. else:
  166. from pdfminer.high_level import extract_text
  167. text = extract_text(fname)
  168. sentence = Sentence(text)
  169. # predict NER tags
  170. tagger.predict(sentence)
  171. # print sentence
  172. ko = (sentence)
  173. ko1 = str(ko).split("→")
  174. import pandas as pd
  175. dfg = []
  176. try:
  177. s = ko1[1].replace("", "").replace("", "").replace("/", ":")
  178. # os.remove(found)
  179. # return 'Invalid image'
  180. dfg.append(s)
  181. df = pd.DataFrame(dfg)
  182. df = df[0]
  183. df.to_csv("df.csv", index=False)
  184. df1 = pd.read_csv("df.csv")
  185. ve = df1["0"].str.split(",")
  186. fgf = ve.to_list()
  187. dfgh = pd.DataFrame(fgf[0])
  188. maindf = dfgh[0] # .str.split(":")
  189. # maindf.to_csv("main.csv")
  190. main1 = maindf.to_list()
  191. main1
  192. # cv=pd.DataFrame(ve)
  193. # cv
  194. per = ["PER"]
  195. org = ["ORG"]
  196. loc = ["LOC"]
  197. organizations = [i for i in main1 for j in org if j in i]
  198. PErsons = [i for i in main1 for j in per if j in i]
  199. location = [i for i in main1 for j in loc if j in i]
  200. except IndexError:
  201. pass
  202. # ************************************* ORGANIZATION ********************************************************************
  203. def organisation():
  204. print('organisation working ')
  205. try:
  206. if len(("OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace('[',
  207. '').replace(
  208. '.com', '').replace('.in', '') + " /" + organizations[1].replace(":ORG", "").replace('"',
  209. '').replace(
  210. '.com', ''))) < 4:
  211. pass
  212. else:
  213. match = str(urlfinal[0]).lower()
  214. match = match.replace('.com', '').replace('www.', '').replace('.in', '').replace('.co', '').replace(
  215. 'https',
  216. '').replace(
  217. 'http', '').replace(":", "").replace("/", "").upper()
  218. print(match)
  219. s1g = organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace('.com',
  220. '') + " /" + \
  221. organizations[1].replace(":ORG", "").replace('"', '').replace('.com', '')
  222. s1 = s1g.upper()
  223. s2 = match.upper()
  224. from difflib import SequenceMatcher
  225. print(s1)
  226. print(s2)
  227. print(SequenceMatcher(None, s1, s2).ratio())
  228. if SequenceMatcher(None, s1, s2).ratio() >= 0.10:
  229. # and SequenceMatcher(None, s1, s2).ratio()<0.50:
  230. final.append(
  231. "OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace('[',
  232. '').replace(
  233. '.com', '').replace(']', '') + " /" + organizations[1].replace(":ORG", "").replace('"',
  234. '').replace(
  235. '.com',
  236. '').replace(']', ''))
  237. else:
  238. final.append("OrganizationName--" + s2)
  239. except IndexError:
  240. try:
  241. if len(("OrganizationName--" + organizations[0].replace(":ORG", "").replace('[', '').replace(']',
  242. '').replace(
  243. '"',
  244. '').replace(
  245. '.com', '').replace('.in', ''))) < 4:
  246. pass
  247. else:
  248. match = str(urlfinal[0]).lower()
  249. match = match.replace('.com', '').replace('www.', '').replace('.in', '').replace('.co',
  250. '').replace(
  251. 'https', '').replace('http', '').replace(":", "").replace("/", "").upper()
  252. s1g = organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace('.com', '')
  253. s1 = s1g.upper()
  254. s2 = match.upper()
  255. from difflib import SequenceMatcher
  256. print(s1)
  257. print(s2)
  258. print(SequenceMatcher(None, s1, s2).ratio())
  259. if SequenceMatcher(None, s1, s2).ratio() >= 0.10:
  260. # and SequenceMatcher(None, s1, s2).ratio()<0.50:
  261. final.append(
  262. "OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace(
  263. '[',
  264. '').replace(
  265. ']', '').replace(
  266. '.com', ''))
  267. else:
  268. final.append("OrganizationName--" + s2)
  269. except IndexError:
  270. try:
  271. match = str(urlfinal[0]).lower()
  272. match = match.replace('.com', '').replace('www.', '').replace('.in', '').replace('.co',
  273. '').upper()
  274. final.append("OrganizationName--" + match)
  275. # remove_list.append(match)
  276. except IndexError:
  277. company()
  278. #################################################company Name########################################
  279. def company():
  280. print('company list working')
  281. import re
  282. new = []
  283. with open('test.txt', 'r+') as f:
  284. flag = False
  285. for line in f:
  286. line = line.upper()
  287. matches = re.findall(
  288. r'''\bENTERPRISE\b|\bTRADE\b|\bEMPIRE\b|\bSTORES\b|\bMACHINERY\b|\bINDUSTRIES\b|\bTECHNOLOGY\b|\bCOMPANY\b|\bDESIGNERS\b|\bPOLYMER\b|\bBELT\b|\bAGRO\b|\bPLASTIC\b|\bGROUP\b|\bTOOLS\b|\bENGG.\b|\bSOLUTION\b|\bCONSTRUCTION\b|\bPACK\b|\bELECT\b|\bSTEEL\b|\bIRON\b|\bDIES\b|\bMOULD\b|\bCORPORATION\b|\bSEEDS\b|\bPOWER\b|\bCONSULTANT\b|\bMFG.\b|\bPRINT\b|\bFOOD\b|\bSOLAR\b|\bINDUSTRY\b|\bLIMITED\b|\bPRIVATE\b|\bPVT\b|\bLTD\b|\bOUTSOURCING\b|\bCNC\b|\bMACHINERIES\b|\bSOLUTIONS\b|\bENGINEERS\b|\bWORKS\b|\bPRODUCTS\b|\bENTERPRISES\b|\bCOMPANIES\b|\bPOLYMERS\b|\bTRADING\b''',
  289. line)
  290. for i in matches:
  291. if i in line:
  292. flag = True
  293. if flag:
  294. o = "OrganizationName--" + line
  295. new.append(o)
  296. # if line.startswith('\n'):
  297. # flag = False
  298. try:
  299. a = new[0].replace('\n', '')
  300. final.append(a)
  301. except IndexError:
  302. final.append("OrganizationName--")
  303. # ************************************* CONTACT PERSON *******************************************************************
  304. def contactpersonname():
  305. print('contactpersonname working')
  306. try:
  307. final.append(
  308. "CONTACTPERSONNAME--" + PErsons[0].replace(":PER", "").replace("[", "").replace('"', '').replace(
  309. "]",
  310. "") + '/' +
  311. PErsons[
  312. 1].replace(":PER", "").replace('"', ''))
  313. except IndexError:
  314. try:
  315. final.append(
  316. "CONTACTPERSONNAME--" + PErsons[0].replace(":PER", "").replace("[", "").replace("]",
  317. "").replace(
  318. '"', ''))
  319. except IndexError:
  320. final.append("CONTACTPERSONNAME--")
  321. def image_to_text():
  322. # doc = DocumentFile.from_images(found)
  323. # result = model(doc)
  324. # image_to_text.txt = result.render()
  325. # tess.pytesseract.tesseract_cmd = r"C:\\Program Files (x86)\\Tesseract-OCR\\tesseract.exe"
  326. # img = Image.open(found)
  327. # text = tess.image_to_string(img)
  328. # image_to_text.txt = text
  329. # print(text)
  330. import cv2
  331. img_path = found
  332. img = cv2.imread(img_path)
  333. img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
  334. cv2.imwrite(str(found), img)
  335. result = ocr.ocr(img_path, cls=True)
  336. result = result[0]
  337. txts = [line[1][0] for line in result]
  338. image_to_text.txt = ""
  339. for i in txts:
  340. if len(i) < 4:
  341. continue
  342. # print(i+"\n")
  343. image_to_text.txt = image_to_text.txt + str(i) + "\n"
  344. # print(image_to_text.txt)
  345. def pdf_to_text():
  346. from pdfminer.high_level import extract_text
  347. pdf_to_text.txt = extract_text(found)
  348. # pdf_to_text.txt= text.replace('\n', ' ')
  349. extensionlist = ['JPEG', 'jpg', 'png', 'JPG', 'PNG', 'jpeg']
  350. if extension in extensionlist:
  351. print('image' + extension)
  352. image_to_text()
  353. x = image_to_text.txt
  354. else:
  355. print('pdf' + extension)
  356. pdf_to_text()
  357. x = pdf_to_text.txt
  358. verticaltext = x
  359. htext = x
  360. # print('------------------------------------------------')
  361. #print('############################################################# this is verticaltext #################################################################')
  362. print(verticaltext)
  363. htext = htext.replace('\n', ' ')
  364. # print('############################################################# this is htext #############################################################')
  365. #print(htext)
  366. y = x.replace('\n', ',')
  367. y = y.replace(' ', ' ')
  368. # y = y.replace(".", " .")
  369. horizontaltext = y
  370. # print('------------------------------------------------')
  371. #print('############################################################# this is horizontaltext #############################################################')
  372. #print(horizontaltext)
  373. textfile = open("test123456.txt", "w")
  374. a = textfile.write(verticaltext)
  375. textfile.close()
  376. textfile = open("vtext.txt", "w")
  377. a = textfile.write(horizontaltext)
  378. textfile.close()
  379. with open('test123456.txt', 'r') as f:
  380. with open('test.txt', 'w') as w:
  381. for line in f:
  382. if line.strip().replace('|', ''):
  383. w.write(line)
  384. ###########################ADDRESS##################################
  385. addrespinlst = []
  386. def splitaddress():
  387. import re
  388. textaddress = htext.replace('\n', ' ')
  389. # print(textaddress)
  390. address1 = (textaddress.partition(",")[0])
  391. words = address1.split()
  392. address1 = words[-1]
  393. addre = (htext.partition(",")[2])
  394. a = addre.replace('\n', ' ').replace('\x0c', '')
  395. addre = (a.partition(",")[2])
  396. matches = re.findall(
  397. r'(.*?)-\d{3} \d{3}|(.*?)\b-\d{6}\b|(.*?)\b\d{6}\b|(.*?)\b\d{3} \d{3}\b|\b(.*?)-\d{2}\b|(.*?)\b\d{3} \d{3}\b',
  398. a)
  399. for match in matches:
  400. address2 = match
  401. address2 = str(address2)
  402. address2 = address2.replace("'", "").replace("(", "").replace(")", "").replace(', ,', '').replace(' ',
  403. '')
  404. matches = re.findall(r'-\d{6}|\b\d{6}\b|\b\d{3} \d{3}\b|\b-\d{2}\b|-\d{3} \d{3}|\b\d{3} \d{3}\b', a)
  405. for address3 in matches:
  406. pass
  407. try:
  408. Address = address1 + "," + address2 + "," + address3
  409. final.append('ADDRESS--' + Address)
  410. addrespinlst.append(Address)
  411. except NameError:
  412. final.append('ADDRESS--')
  413. #print('############################################################ Addressmodelworking #############################################################')
  414. # doc = nlp_model1(textaddress)
  415. # addlist = []
  416. # for ent in doc.ents:
  417. # name = (f'{ent.label_.upper():{10}}--{ent.text}')
  418. # addlist.append(name)
  419. # try:
  420. # Address = addlist[0]
  421. # final.append(Address)
  422. # addrespinlst.append(Address)
  423. # remove_list.append(
  424. # str(Address).replace("[", "").replace("]", "").replace("\\n", "").replace("'", "").replace(
  425. # "ADDRESS--",
  426. # ""))
  427. # except IndexError:
  428. # final.append("ADDRESS--")
  429. pass
  430. ################################################## website#######################################################
  431. # import re
  432. # url = []
  433. # matches = re.findall(r'www.*', verticaltext)
  434. # for match in matches:
  435. # if (match.count('.')) == 1:
  436. # a_string1 = match.replace("www", "www.")
  437. # final.append("Urls--" + a_string1)
  438. # url.append(a_string1)
  439. # else:
  440. # final.append("Urls--" + match)
  441. # if len(url)==0:
  442. # from urlextract import URLExtract
  443. # extractor = URLExtract()
  444. # urls = extractor.find_urls(verticaltext)
  445. # try:
  446. # urllist = urls[0]
  447. # final.append("Urls--"+urllist)
  448. # url.append(urllist)
  449. # except IndexError:
  450. # final.append("Urls--")
  451. # for match in matches:
  452. # if (match.count('.')) == 1:
  453. # a_string1 = match.replace("www", "www.")
  454. # final.append("Urls--" + a_string1)
  455. # url.append(a_string1)
  456. # else:
  457. # final.append("Urls--" + match)
  458. # url.append(match)
  459. # remove_list.append(match)
  460. # else:
  461. # final.append("Urls--" )
  462. ################################################## website#######################################################
  463. import re
  464. # final=[]
  465. url = []
  466. urlfinal = []
  467. matches = re.findall(r'www.*', verticaltext)
  468. for match in matches:
  469. if (match.count('.')) == 1:
  470. a_string1 = match.replace("www", "www.")
  471. # final.append("Urls--" + a_string1)
  472. url.append(a_string1)
  473. else:
  474. url.append(match)
  475. if len(url) == 0:
  476. from urlextract import URLExtract
  477. extractor = URLExtract()
  478. urls = extractor.find_urls(verticaltext)
  479. try:
  480. urllist = urls[0]
  481. url.append(urllist)
  482. url.append(urllist)
  483. except IndexError:
  484. pass
  485. for match in matches:
  486. if (match.count('.')) == 1:
  487. a_string1 = match.replace("www", "www.")
  488. url.append(a_string1)
  489. # url.append(a_string1)
  490. else:
  491. url.append(match)
  492. url.append(match)
  493. else:
  494. pass
  495. try:
  496. test_string = url[0]
  497. test_list = ['com', 'www', 'in', 'co', "WWW", "COM", "CO", "IN"]
  498. res = [ele for ele in test_list if (ele in test_string)]
  499. if len(res) == 0:
  500. print('no match')
  501. final.append('urls--')
  502. else:
  503. print('matched')
  504. final.append('urls--' + url[0])
  505. urlfinal.append(url[0])
  506. except IndexError:
  507. final.append('urls--')
  508. print(
  509. '############################################################# url #############################################################')
  510. print(url)
  511. #######organisation and contact################
  512. # def company_url():
  513. # # print('--url--')
  514. # # print(url)
  515. # try:
  516. # match = str(url[0]).lower()
  517. # match =match.replace('.com','').replace('www.','').replace('.in','').replace('.co','').upper()
  518. # final.append("OrganizationName--" + match)
  519. # # remove_list.append(match)
  520. # except IndexError:
  521. # org_name()
  522. # organisation()
  523. # final.append("OrganizationName--")
  524. # make example sentence
  525. # print(horizontaltext)
  526. sentence = Sentence(verticaltext)
  527. # predict NER tags
  528. tagger.predict(sentence)
  529. # print sentence
  530. ko = (sentence)
  531. ko1 = str(ko).split("→")
  532. import pandas as pd
  533. dfg = []
  534. try:
  535. s = ko1[1].replace("", "").replace("", "").replace("/", ":")
  536. except IndexError:
  537. os.remove(found)
  538. return 'Invalid image'
  539. dfg.append(s)
  540. df = pd.DataFrame(dfg)
  541. df = df[0]
  542. df.to_csv("df.csv", index=False)
  543. df1 = pd.read_csv("df.csv")
  544. ve = df1["0"].str.split(",")
  545. fgf = ve.to_list()
  546. dfgh = pd.DataFrame(fgf[0])
  547. maindf = dfgh[0] # .str.split(":")
  548. # maindf.to_csv("main.csv")
  549. main1 = maindf.to_list()
  550. main1
  551. # cv=pd.DataFrame(ve)
  552. # cv
  553. per = ["PER"]
  554. org = ["ORG"]
  555. loc = ["LOC"]
  556. organizations = [i for i in main1 for j in org if j in i]
  557. PErsons = [i for i in main1 for j in per if j in i]
  558. location = [i for i in main1 for j in loc if j in i]
  559. # ************************************* ORGANIZATION ********************************************************************
  560. try:
  561. if len(("OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace('[',
  562. '').replace(
  563. ']', '').replace(
  564. '.com', '') + " /" + organizations[1].replace(":ORG", "").replace('"', '').replace('.com', ''))) < 4:
  565. pass
  566. # company_url()
  567. else:
  568. match = str(urlfinal[0]).lower()
  569. match = match.replace('.com', '').replace('www.', '').replace('.in', '').replace('.co', '').replace(
  570. 'https',
  571. '').replace(
  572. 'http', '').replace(":", "").replace("/", "").upper()
  573. print(match)
  574. s1g = organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace(']', '').replace(
  575. '.com', '') + " /" + \
  576. organizations[1].replace(":ORG", "").replace('"', '').replace('.com', '')
  577. s1 = s1g.upper()
  578. s2 = match.upper()
  579. from difflib import SequenceMatcher
  580. print(s1)
  581. print(s2)
  582. print(SequenceMatcher(None, s1, s2).ratio())
  583. if SequenceMatcher(None, s1, s2).ratio() >= 0.10:
  584. # and SequenceMatcher(None, s1, s2).ratio()<0.50:
  585. final.append(
  586. "OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace('[',
  587. '').replace(
  588. '.com', '').replace(']', '') + " /" + organizations[1].replace(":ORG", "").replace('"',
  589. '').replace(
  590. '.com', '').replace(']', ''))
  591. else:
  592. final.append("OrganizationName--" + s2)
  593. except IndexError:
  594. try:
  595. if len(("OrganizationName--" + organizations[0].replace(":ORG", "").replace('[', '').replace(']',
  596. '').replace(
  597. '"',
  598. '').replace(
  599. '.com', ''))) < 4:
  600. pass
  601. # company_url()
  602. else:
  603. match = str(urlfinal[0]).lower()
  604. match = match.replace('.com', '').replace('www.', '').replace('.in', '').replace('.co', '').replace(
  605. 'https', '').replace('http', '').replace(":", "").replace("/", "").upper()
  606. s1g = organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace(']',
  607. '').replace(
  608. '.com', '')
  609. s1 = s1g.upper()
  610. s2 = match.upper()
  611. from difflib import SequenceMatcher
  612. print(s1)
  613. print(s2)
  614. print(SequenceMatcher(None, s1, s2).ratio())
  615. if SequenceMatcher(None, s1, s2).ratio() >= 0.10:
  616. # and SequenceMatcher(None, s1, s2).ratio()<0.50:
  617. final.append(
  618. "OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace('[',
  619. '').replace(
  620. ']', '').replace(
  621. '.com', '').replace(']', ''))
  622. else:
  623. final.append("OrganizationName--" + s2)
  624. except IndexError:
  625. org_name()
  626. organisation()
  627. # final.append("OrganizationName--")
  628. # ************************************* CONTACT PERSON *******************************************************************
  629. try:
  630. final.append(
  631. "CONTACTPERSONNAME--" + PErsons[0].replace(":PER", "").replace("[", "").replace('"', '').replace("]",
  632. "") +
  633. PErsons[
  634. 1].replace(":PER", "").replace('"', ''))
  635. except IndexError:
  636. try:
  637. final.append(
  638. "CONTACTPERSONNAME--" + PErsons[0].replace(":PER", "").replace("[", "").replace("]", "").replace(
  639. '"',
  640. ''))
  641. except IndexError:
  642. org_name()
  643. contactpersonname()
  644. # final.append("CONTACTPERSONNAME--")
  645. ###############address flair#####################
  646. try:
  647. print(
  648. '############################################################# address new code #############################################################')
  649. loactionlst = ['address', 'factory', 'd.no', 'h.no', 'h. no', 'plot', 'flat', 'plat']
  650. loclst = [i for i in loactionlst if i in htext.lower()]
  651. textaddress = htext
  652. textaddress = textaddress.replace("|", ",")
  653. textaddress = textaddress.lower()
  654. nlp = pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy="simple")
  655. grop = nlp(textaddress)
  656. citycountry = []
  657. print('########################### city or country name ###########################')
  658. d = grop[-1]
  659. if d['entity_group'] == "COUNTRY":
  660. print(d["word"])
  661. citycountry.append(d["word"])
  662. elif d['entity_group'] == "CITY":
  663. print(d["word"])
  664. citycountry.append(d["word"])
  665. try:
  666. address1 = loclst[0]
  667. except IndexError:
  668. address1 = (textaddress.partition(",")[0])
  669. words = address1.split()
  670. address1 = words[-1]
  671. star_location = address1.lower()
  672. end_location = citycountry[0].replace("#", "")
  673. start = star_location
  674. end = end_location
  675. s = textaddress.lower()
  676. middle_address = (s.split(start))[-1].split(end)[0]
  677. Address = start + middle_address + end
  678. Address = Address.replace('--', '').title()
  679. print(Address)
  680. if Address.count(',') < 2:
  681. splitaddress()
  682. else:
  683. final.append('ADDRESS--' + Address)
  684. # star_location = location[0].replace(":LOC", "").replace('"', '').replace('[', '')
  685. # end_location = location[-1].replace(":LOC", "").replace('"', '').replace(']', '')
  686. # d1 = star_location.split()
  687. # d2 = end_location.split()
  688. # d3 = d1[0]
  689. # d4 = d2[0]
  690. # start = d3
  691. # end = d4
  692. # s = horizontaltext
  693. # middle_address = ((s.split(start))[1].split(end)[0])
  694. # Address = d3 + middle_address + d4
  695. # final.append('ADDRESS--' + Address)
  696. # addrespinlst.append(Address)
  697. except IndexError:
  698. splitaddress()
  699. ########################################## Designation ###########################################
  700. import re
  701. new = []
  702. with open('test.txt', 'r') as f:
  703. flag = False
  704. for line in f:
  705. line1 = line
  706. line = line.upper()
  707. matches = re.findall(
  708. r'''\bAPPRENTICE\b|\bEXECUTIVE\b|\bPROPRIETOR\b|\bPARTNER\b|\bMD\b|\bANALYST\b|\bPRACTITIONER\b|\bCUSTOMER\b|\bCOO\b|\bCOACH\b|\bADMINISTRATIVE\b|\bADMINISTRATOR\b|\bAGENT\b|\bHEAD\b|\bCHIEF\b|\bDIRECTOR\b|\bVICE\b|\bPRESIDENT\b|\bMANAGER\b|\bCOORDINATOR\b|\bCOUNSELOR\b|\bSUPERVISOR\b|\bASSISTANT\b|\bSPECIALIST\b|\bARTIST\b|\bWORKER\b|\bCONSULTANT\b|\bREPRESENTATIVE\b|\bARCHITECT\b|\bSTAFF\b|\bMEMBER\b|\bDEVELOPER\b|\bENGINEER\b|\bEXAMINOR\b|\bDOCTOR\b|\bPROFESSOR\b|\bTEACHER\b|\bLEAD\b|\bOFFICER\b|\bCEO\b|\bC.E.O\b|\bJUNIOR\b|\bSENIOR\b|\bPROFESSOR\b|\bSALES\b''',
  709. line)
  710. for match in matches:
  711. line = line.replace('-', '')
  712. # print(line)
  713. o = "Designation--" + line
  714. new.append(o)
  715. remove_list.append(str(line1).replace('\n', ''))
  716. try:
  717. a = new[0].replace('\n', '')
  718. final.append(a)
  719. except IndexError:
  720. final.append("Designation--")
  721. ###################################################Phone number#################################################
  722. num = []
  723. import phonenumbers
  724. # print(verticaltext)
  725. numbers = phonenumbers.PhoneNumberMatcher(
  726. verticaltext.replace('+91', '').replace('(0)', '').replace('(', '').replace(')', ''), "IN")
  727. for number in numbers:
  728. number = str(number).split(")")
  729. num.append(number[1])
  730. # num.append(number[-1])
  731. print(num)
  732. import re
  733. # Input list of strings
  734. # num =[' 7227906777Extn1204634444']
  735. # Define a regular expression pattern to split when text is present
  736. pattern = r'[a-zA-Z]+'
  737. # Function to split a string based on the pattern
  738. def split_string(text):
  739. return re.split(pattern, text)
# Process each line in the list
split_lines = [split_string(line) for line in num]
# Flatten the list of lists into a single list
split_lines = [item for sublist in split_lines for item in sublist]
# Remove any empty strings
num = [item for item in split_lines if item]
# Print the split lines
print(num)
# First number found -> ContactNumber, last -> OrganizationNumber; blank
# values are appended when fewer than two numbers were detected so the
# downstream Keys/Values table always has both rows.
if len(num) == 0:
final.append("ContactNumber--")
final.append("OrganizationNumber--")
elif len(num) > 1:
final.append("ContactNumber--" + num[0].replace(' ', ''))
final.append("OrganizationNumber--" + num[-1].replace(' ', ''))
elif len(num) == 1:
# NOTE(review): the try/except is redundant here -- len(num) == 1
# guarantees num[0] exists -- kept byte-identical.
try:
final.append("ContactNumber--" + num[0].replace(' ', ''))
final.append("OrganizationNumber--")
except IndexError:
final.append("ContactNumber--")
final.append("OrganizationNumber--")
print(
'############################################################# num #############################################################')
print(num)
# try:
# final.append("PhoneNumber--" + num[0].replace(' ', ''))
# remove_list.append(num[0])
# except IndexError:
# pass
# try:
# final.append("PhoneNumber1--" + num[1].replace(' ', ''))
# remove_list.append(num[1])
# except IndexError:
# pass
# try:
# final.append("PhoneNumber2--" + num[2].replace(' ', ''))
# remove_list.append(num[2])
# except IndexError:
# pass
  779. ################################################### Email######################################################
  780. import re
  781. from email_scraper import scrape_emails
  782. s = list(scrape_emails(horizontaltext))
  783. email_id = s
  784. # email_id = []
  785. # matches = re.findall(r'[\w\.-]+@[\w\.-]+', verticaltext)
  786. # for match in matches:
  787. # email_id.append(match)
  788. # # final.append('Email--' + match)
  789. # email_ = str(email_id).replace("[", "").replace("]", "").replace("'", "")
  790. # # final.append(email_)
  791. # # final.append('Email--' + email_)
  792. # # remove_list.append(email_)
  793. if len(email_id) > 1:
  794. final.append(
  795. 'ContactEmail--' + str(email_id[0]).replace("[", "").replace("]", "").replace("\\n", "").replace("'",
  796. ""))
  797. final.append(
  798. 'OrganizationEmail--' + str(email_id[-1]).replace("[", "").replace("]", "").replace("\\n", "").replace(
  799. "'",
  800. ""))
  801. else:
  802. try:
  803. final.append(
  804. 'ContactEmail--' + str(email_id[0]).replace("[", "").replace("]", "").replace("\\n", "").replace(
  805. "'",
  806. ""))
  807. final.append('OrganizationEmail--')
  808. except IndexError:
  809. final.append('ContactEmail--')
  810. final.append('OrganizationEmail--')
###############PINCODE############
# Collect candidate 6-digit pincodes; `addrespinlst` holds any address text
# captured by the Address section earlier in this function.
pinlst = []
print(addrespinlst)
import pgeocode
# try:
# matche1 = re.findall(r'-\d{6}|\b\d{6}\b|\b\d{3} \d{3}\b|-\d{3} \d{3}|\b\d{3} \d{3}\b', addrespinlst[0])
# for i in matche1:
# address3 = i.replace(' ', '').replace('-', '')
# pinlst.append(address3)
# except IndexError:
  821. lst = []
  822. for i in num:
  823. i = i[1:]
  824. lst.append(i)
  825. infile = r"vtext.txt"
  826. outfile = r"cleaned_file.txt"
  827. import glob
  828. delete_list = lst
  829. # delete_list = ["firstname1 lastname1","firstname2 lastname2","firstnamen lastnamen",'Director - Sales & Business Development']
  830. fin = open(infile, "r+")
  831. fout = open(outfile, "w+")
  832. for line12 in fin:
  833. for word in delete_list:
  834. line12 = line12.replace(word, "")
  835. fout.write(line12)
  836. fin.close()
# print(line)
# print(addrespinlst)
import pgeocode
#print(line12)
import re
# NOTE(review): only `line12` -- the LAST line read from vtext.txt above --
# is scanned for a pincode here; scanning the whole cleaned file was
# presumably intended. TODO confirm with the author.
matche1 = re.findall(r'-\d{6}|\b\d{6}\b|\b\d{3} \d{3}\b|-\d{3} \d{3}|\b\d{3} \d{3}\b', line12)
for i in matche1:
address3 = i.replace(' ', '').replace('-', '')
pinlst.append(address3)
# Reverse-geocode the last pincode found via pgeocode.  On success this
# appends PinCode1/country/landmark/state/city rows to `final`; on failure
# (no pincode collected, or the lookup raised) the same keys are appended
# with blank values so the output schema stays fixed.
nomi = pgeocode.Nominatim('IN')
try:
a = nomi.query_postal_code(str(pinlst[-1]))
# print(a)
b = a.keys()
c = b.values.tolist()
d = a.tolist()
postal_code = "PinCode1" + "--" + d[0]
final.append(postal_code)
country_code = c[1] + "--" + str(d[1])
final.append(country_code)
place_name = 'LandMark1' + "--" + str(d[2])
final.append(place_name)
state_name = c[3] + "--" + str(d[3])
final.append(state_name)
state_code = c[4] + "--" + str(d[4])
final.append(state_code)
county_name = 'CityName1' + "--" + str(d[5])
final.append(county_name)
except (IndexError, NameError):
final.append("PinCode1--"+" ")
final.append("country_code--")
final.append("LandMark1--")
final.append("state_name--")
final.append("state_code--")
final.append("CityName1--")
  872. ######################################################## json #####################################################################
  873. import pandas as pd
  874. df = pd.DataFrame(final)
  875. df1 = df[0].str.split('--', expand=True)
  876. # print(df1)
  877. df1.rename({df1.columns[-2]: 'Keys'}, axis=1, inplace=True)
  878. df1.rename({df1.columns[-1]: 'Values'}, axis=1, inplace=True)
  879. df1['Keys'] = df1['Keys'].str.strip()
  880. df1.to_csv('path123.csv', index=False)
  881. df2 = pd.read_csv('path123.csv')
  882. print(df2)
  883. if df2['Values'].isnull().all():
  884. print("Column 'Column2' is empty.")
  885. return 'Invalid image'
  886. else:
  887. pass
  888. df2 = df2.T
  889. df2.to_csv('path1.csv', index=False, header=False)
  890. df1 = pd.read_csv('path1.csv')
  891. df1.to_json('firstjson1.json', orient="index")
  892. import json
  893. with open('firstjson1.json', 'r') as json_file:
  894. json_load = json.load(json_file)
# # url = "https://test.bizgaze.app:8443/apis/v4/bizgaze/integrations/businesscards/create"
# Collapse the single-row JSON produced above into one flat object by
# stripping brackets and the '{"0":' wrapper (string surgery on the dump).
nothing = json.dumps(json_load).replace("]", "").replace("[", "").replace('{"0":', '').replace('}}', '}')
# # print('--------------------------------------------------------------------------')
# # print(nothing)
# --- attach the card image as base64 ---------------------------------------
empty = []
import base64
# `found` is presumably the path of the processed card image, set earlier in
# this function -- confirm against the caller.
name = found
image = open(name, 'rb')
image_read = image.read()
image_64_encode = base64.b64encode(image_read)
NULL = 'null'
empty.append("ByteData--" + (NULL).strip('""'))
image_64_encode = image_64_encode.decode('utf-8')
empty.append("FileData--" + str(image_64_encode))
# Split the path to recover the base filename and its extension.
imagedata = name.split("/")
imagename = str(imagedata[-1]).replace('"', '').replace("[", "").replace("]", "")
imagename1 = str(imagename).split('.')
imagename = str(imagename1[-2]).replace("[", "]")
empty.append("FileName--" + imagename)
empty.append("FilePath--"+ "")
imageExtension = str(imagename1[-1]).replace("[", "]")
empty.append("FileType--" + imageExtension)
image.close()
# Turn the "Key--Value" strings into a single-row dict via a transposed frame.
import pandas as pd
df = pd.DataFrame(empty)
df = df[0].str.split("--", expand=True)
data1 = pd.DataFrame(df[0])
data2 = pd.DataFrame(df[1])
dt = data2.set_index(data1[0])
dt4 = dt.T
dictionary = dt4.to_dict(orient="index")
list1 = []
# list.append(a)
list1.append(dictionary[1])
# # final.append("image--"+str(dictionary[1]).replace("\'",'"'))
print('--------------------')
# print(namelist)
import json
# JSON data:
x = nothing
# python object to be appended
y = {"image": dictionary[1]}
# parsing JSON string:
z = json.loads(x)
# appending the data
z.update(y)
# the result is a JSON string:
# print(json.dumps(z))
# Accumulate this card's merged record; `zlist` is returned at the end of
# the function.
zlist.append(z)
  944. #############################################creating csv#####################################
  945. # print(final)
  946. #print(imagelist)
  947. #final.append('image--' + str(imagelist))
  948. # import requests
  949. # import json
  950. # # url = "https://anwi.bizgaze.app/apis/v4/bizgaze/integrations/businesscards/create/list" #dev
  951. # url = "https://qa.bizgaze.com/apis/v4/bizgaze/integrations/businesscards/create/list" #testing
  952. # # url = "https://test.bizgaze.app/apis/v4/bizgaze/integrations/businesscards/create" # test
  953. # # url='http://localhost:3088/apis/v4/bizgaze/integrations/businesscards/create'
  954. # # url = 'https://c01.bizgaze.app/apis/v4/bizgaze/integrations/businesscards/create' # C01
  955. # payload1 = json.dumps(zlist)
  956. # # print('--------------------------------------------------------------------------')
  957. # #print(payload1)
  958. # headers = {
  959. # #'Authorization': 'stat 1a936137490040c997928f485e3cdd7a', #dev
  960. # # 'Authorization': 'stat 16516391d0074f4c8a15ea16fb49470b',#testing
  961. # # 'Authorization': 'stat 08e55fcfbaa940c8ab8145a074c444d1',
  962. # # 'Authorization': 'stat f7cdb402e01e44e5842878653946168f', # c01
  963. # #'Authorization': 'Stat c3e11b2fcbfe455b86a1fe6efde02a69',#demo
  964. # 'Authorization':'Stat e5bc6ad08f2c42feb5f98a2a521d00af',
  965. # 'Content-Type': 'application/json'
  966. # }
  967. # response = requests.request("POST", url, headers=headers, data=payload1)
  968. # # print("##############################################################")
  969. # print(payload1)
  970. # #print(zlist)
  971. # # import os
  972. # # if 'BusinessCards Created Successfully' in response.text:
  973. # # print('present')
  974. # # os.remove(found)
  975. # # else:
  976. # # print('not present')
  977. # df1.to_json('visitingcard.json')
  978. # data = df1.to_json('visiting.json', orient='records')
  979. # print(data)
  980. #return render_template('index.html')
  981. #return response.text
  982. #return z
  983. return zlist
if __name__ == "__main__":
# Development server bound to all interfaces on port 1112; `app` is
# presumably the Flask instance created near the top of this file --
# confirm before deploying (this binding is not production-safe).
app.run(host='0.0.0.0', port=1112)