Няма описание
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

Business_cards.py 45KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121
  1. from flask import Flask, render_template, request, redirect, Response, send_file
  2. import os
  3. import openai
  4. import requests
  5. import pandas as pd
  6. import pgeocode
  7. from email_scraper import scrape_emails
  8. import phonenumbers
  9. from pdfminer.high_level import extract_text
  10. import pytesseract
  11. import time
  12. import multiprocessing
  13. from PIL import Image
  14. from functools import partial
  15. from urlextract import URLExtract
  16. import pytesseract as tess
  17. from PIL import Image
  18. # from doctr.io import DocumentFile
  19. # from doctr.models import ocr_predictor
  20. # model = ocr_predictor(pretrained=True)
  21. # load tagger
  22. ######################################################
  23. import os
  24. import glob
  25. from pytesseract import *
  26. import shutil
  27. import cv2
  28. import matplotlib
  29. from werkzeug.utils import secure_filename
  30. import requests
  31. import spacy
  32. import time
  33. import multiprocessing
  34. from PIL import Image
  35. from functools import partial
  36. nlp_model = spacy.load("D:/projects/C01app/Resume_parser/ME")
  37. nlp_model1 = spacy.load("D:/projects/C01app/Resume_parser/bdeeducation_50_0.2")
  38. from flask import Flask, render_template, request, redirect, Response, send_file
  39. import pandas as pd
  40. ################################################################
  41. Current_Working_Directory=os.getcwd()
  42. Current_Working_Directory=Current_Working_Directory.replace("\\","/")
  43. nlp_model1 = spacy.load(Current_Working_Directory + "/Invoice_parser/p")
  44. ################################################################
  45. # import spacy
  46. # nlp_model1 = spacy.load('./ADD3001.2')
  47. from flair.data import Sentence
  48. from flair.models import SequenceTagger
  49. from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
  50. tokenizer = AutoTokenizer.from_pretrained("ml6team/bert-base-uncased-city-country-ner")
  51. model = AutoModelForTokenClassification.from_pretrained("ml6team/bert-base-uncased-city-country-ner")
  52. from paddleocr import PaddleOCR, draw_ocr
  53. ocr = PaddleOCR(use_angle_cls=True, lang='en', use_space_char=True, show_log=True)
  54. tagger = SequenceTagger.load("flair/ner-english-large")
  55. import datetime
  56. app = Flask(__name__)
  57. # app.config["IMAGE_UPLOADS"] = "C:/inetpub/wwwroot/FlaskApplication/Flask_Demo/upload/"
  58. @app.route('/', methods=['GET'])
  59. def card():
  60. return render_template('card.html')
  61. @app.route('/upload_BusinessCards', methods=["POST"])
  62. # @app.route('/multiplecards', methods=["POST"])
  63. def multiplecards():
  64. # print('################## multiple card detection #######################')
  65. # print(Dataset)
  66. datalist=[]
  67. Dataset = request.get_json()
  68. # print(data)
  69. #datalist.append(Dataset)
  70. data = {'visiting': Dataset}
  71. for i in data['visiting']:
  72. import time
  73. # time.sleep(1)
  74. a = i
  75. x = a['FileData']
  76. # print(x)
  77. y = a['FileName']
  78. z = a['FileType']
  79. # CreatedBy=a['CreatedBy']
  80. name = y + '.' + z
  81. # print(name)
  82. # print(y)
  83. # image = y.split("/")
  84. # filename=image[-1]
  85. # print(x)
  86. img_data = x.encode()
  87. import base64
  88. with open('./multicards/' + name, "wb") as fh:
  89. fh.write(base64.decodebytes(img_data))
  90. # print(i)
  91. # import os
  92. # import glob
  93. # for i in glob.glob('./multipleupload/*'):
  94. found = './multicards/' + name
  95. print(found)
  96. extension = found.split('.')[-1]
  97. # for root, dirs, fils in os.glob('./multipleupload'):
  98. # for name in files:
  99. # foundfile= os.path.join(root, name)
  100. # print(foundfile)
  101. import re
  102. import csv
  103. import glob
  104. import os
  105. # import pytesseract
  106. # import cv2
  107. import numpy as np
  108. import glob
  109. import os
  110. import cv2
  111. import requests
  112. final = []
  113. # final.append('assignto--'+CreatedBy)
  114. imagelist = []
  115. # print(found)
  116. remove_list = []
  117. import os
  118. import glob
  119. import pdfminer
  120. # import os
  121. # ts = 0
  122. # for file_name in glob.glob('./upload/*'):
  123. # fts = os.path.getmtime(file_name)
  124. # if fts > ts:
  125. # ts = fts
  126. # found = file_name
  127. # print(found)
  128. # print(extension)
def org_name():
    """Fallback NER pass: OCR the uploaded card (if it is an image), run the
    flair sequence tagger on the extracted text, and split the tagged output
    into PER / ORG / LOC candidate lists.

    Uses closure variables from the enclosing request loop: `found` (saved
    file path), `extension` (file suffix), plus the module-level `Sentence`
    and `tagger`.

    NOTE(review): `organizations`, `PErsons` and `location` are assigned as
    plain locals here — no `nonlocal`/`global` is visible, so these results
    do not appear to propagate back to the enclosing scope even though the
    callers invoke org_name() right before organisation()/contactpersonname().
    Confirm against the full file whether that is intentional.
    """
    print('org_name is working')
    import pytesseract
    fname = found
    if extension != 'pdf':
        # Grayscale the image in place, then re-save at high DPI so
        # Tesseract's PDF rendering gets a cleaner input.
        img = cv2.imread(fname)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        cv2.imwrite(str(found), img)
        from PIL import Image
        im = Image.open(found)
        im.save("images1.png", dpi=(1200, 1200))
        fname = "images1.png"
        import pytesseract as tess
        from PIL import Image
        # Hard-coded Windows Tesseract install path — deployment-specific.
        tess.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
        # Render the OCR result as a searchable PDF, then mine its text layer.
        pdf = tess.image_to_pdf_or_hocr(fname, extension="pdf")
        with open("demo.pdf", "w+b", ) as f:
            f.write(pdf)
        from pdfminer.high_level import extract_text
        text = extract_text('demo.pdf')
    else:
        # Already a PDF: extract its text layer directly.
        from pdfminer.high_level import extract_text
        text = extract_text(fname)
    # Tag the text with flair; str(sentence) renders tokens joined by "→",
    # so splitting on the arrow isolates the tagged-span portion.
    sentence = Sentence(text)
    tagger.predict(sentence)
    ko = (sentence)
    ko1 = str(ko).split("→")
    import pandas as pd
    dfg = []
    try:
        # Normalize the tag text; the empty-string replaces look like
        # scrape-damaged special characters — TODO confirm original bytes.
        s = ko1[1].replace("", "").replace("", "").replace("/", ":")
        dfg.append(s)
        # Round-trip through CSV to split the tag list on commas
        # (presumably for consistent quoting — verify this is still needed).
        df = pd.DataFrame(dfg)
        df = df[0]
        df.to_csv("df.csv", index=False)
        df1 = pd.read_csv("df.csv")
        ve = df1["0"].str.split(",")
        fgf = ve.to_list()
        dfgh = pd.DataFrame(fgf[0])
        maindf = dfgh[0]
        main1 = maindf.to_list()
        main1
        # Bucket tagged fragments by entity label substring.
        per = ["PER"]
        org = ["ORG"]
        loc = ["LOC"]
        organizations = [i for i in main1 for j in org if j in i]
        PErsons = [i for i in main1 for j in per if j in i]
        location = [i for i in main1 for j in loc if j in i]
    except IndexError:
        # No "→" in the tagger output (nothing tagged): leave lists unset.
        pass
  192. # ************************************* ORGANIZATION ********************************************************************
def organisation():
    """Append an "OrganizationName--" entry to `final`, preferring flair ORG
    tags cross-checked against the detected website URL.

    Fallback chain (each IndexError drops to the next):
      1. two ORG candidates joined with " /", compared to the URL;
      2. a single ORG candidate, compared to the URL;
      3. the URL itself as the organization name;
      4. the keyword-based company() scan.

    Uses closure variables `organizations`, `urlfinal`, `final`, `company`.
    NOTE(review): the similarity threshold 0.10 is very permissive — confirm
    it is intentional.
    """
    print('organisation working ')
    try:
        # Skip when the combined candidate is effectively empty
        # ("OrganizationName--" alone is 18 chars, so < 4 only guards a
        # degenerate case — presumably meant to test the name part; verify).
        if len(("OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace('.com', '').replace('.in', '') + " /" + organizations[1].replace(":ORG", "").replace('"', '').replace('.com', ''))) < 4:
            pass
        else:
            # Normalize the first detected URL into a bare uppercase name.
            match = str(urlfinal[0]).lower()
            match = match.replace('.com', '').replace('www.', '').replace('.in', '').replace('.co', '').replace('https', '').replace('http', '').replace(":", "").replace("/", "").upper()
            print(match)
            s1g = organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace('.com', '') + " /" + organizations[1].replace(":ORG", "").replace('"', '').replace('.com', '')
            s1 = s1g.upper()
            s2 = match.upper()
            from difflib import SequenceMatcher
            print(s1)
            print(s2)
            print(SequenceMatcher(None, s1, s2).ratio())
            if SequenceMatcher(None, s1, s2).ratio() >= 0.10:
                # ORG tags agree (loosely) with the URL: keep both ORG names.
                final.append("OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace('.com', '').replace(']', '') + " /" + organizations[1].replace(":ORG", "").replace('"', '').replace('.com', '').replace(']', ''))
            else:
                # ORG tags look unrelated to the URL: trust the URL-derived name.
                final.append("OrganizationName--" + s2)
    except IndexError:
        # Fewer than two ORG candidates (or no URL): retry with just one.
        try:
            if len(("OrganizationName--" + organizations[0].replace(":ORG", "").replace('[', '').replace(']', '').replace('"', '').replace('.com', '').replace('.in', ''))) < 4:
                pass
            else:
                match = str(urlfinal[0]).lower()
                match = match.replace('.com', '').replace('www.', '').replace('.in', '').replace('.co', '').replace('https', '').replace('http', '').replace(":", "").replace("/", "").upper()
                s1g = organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace('.com', '')
                s1 = s1g.upper()
                s2 = match.upper()
                from difflib import SequenceMatcher
                print(s1)
                print(s2)
                print(SequenceMatcher(None, s1, s2).ratio())
                if SequenceMatcher(None, s1, s2).ratio() >= 0.10:
                    final.append("OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace(']', '').replace('.com', ''))
                else:
                    final.append("OrganizationName--" + s2)
        except IndexError:
            # No usable ORG tags at all: fall back to the URL name.
            try:
                match = str(urlfinal[0]).lower()
                match = match.replace('.com', '').replace('www.', '').replace('.in', '').replace('.co', '').upper()
                final.append("OrganizationName--" + match)
            except IndexError:
                # No URL either: last resort keyword scan of the OCR text.
                company()
  268. #################################################company Name########################################
  269. def company():
  270. print('company list working')
  271. import re
  272. new = []
  273. with open('test.txt', 'r+') as f:
  274. flag = False
  275. for line in f:
  276. line = line.upper()
  277. matches = re.findall(
  278. r'''\bENTERPRISE\b|\bTRADE\b|\bEMPIRE\b|\bSTORES\b|\bMACHINERY\b|\bINDUSTRIES\b|\bTECHNOLOGY\b|\bCOMPANY\b|\bDESIGNERS\b|\bPOLYMER\b|\bBELT\b|\bAGRO\b|\bPLASTIC\b|\bGROUP\b|\bTOOLS\b|\bENGG.\b|\bSOLUTION\b|\bCONSTRUCTION\b|\bPACK\b|\bELECT\b|\bSTEEL\b|\bIRON\b|\bDIES\b|\bMOULD\b|\bCORPORATION\b|\bSEEDS\b|\bPOWER\b|\bCONSULTANT\b|\bMFG.\b|\bPRINT\b|\bFOOD\b|\bSOLAR\b|\bINDUSTRY\b|\bLIMITED\b|\bPRIVATE\b|\bPVT\b|\bLTD\b|\bOUTSOURCING\b|\bCNC\b|\bMACHINERIES\b|\bSOLUTIONS\b|\bENGINEERS\b|\bWORKS\b|\bPRODUCTS\b|\bENTERPRISES\b|\bCOMPANIES\b|\bPOLYMERS\b|\bTRADING\b''',
  279. line)
  280. for i in matches:
  281. if i in line:
  282. flag = True
  283. if flag:
  284. o = "OrganizationName--" + line
  285. new.append(o)
  286. # if line.startswith('\n'):
  287. # flag = False
  288. try:
  289. a = new[0].replace('\n', '')
  290. final.append(a)
  291. except IndexError:
  292. final.append("OrganizationName--")
  293. # ************************************* CONTACT PERSON *******************************************************************
  294. def contactpersonname():
  295. print('contactpersonname working')
  296. try:
  297. final.append(
  298. "CONTACTPERSONNAME--" + PErsons[0].replace(":PER", "").replace("[", "").replace('"', '').replace(
  299. "]",
  300. "") + '/' +
  301. PErsons[
  302. 1].replace(":PER", "").replace('"', ''))
  303. except IndexError:
  304. try:
  305. final.append(
  306. "CONTACTPERSONNAME--" + PErsons[0].replace(":PER", "").replace("[", "").replace("]",
  307. "").replace(
  308. '"', ''))
  309. except IndexError:
  310. final.append("CONTACTPERSONNAME--")
  311. def image_to_text():
  312. # doc = DocumentFile.from_images(found)
  313. # result = model(doc)
  314. # image_to_text.txt = result.render()
  315. # tess.pytesseract.tesseract_cmd = r"C:\\Program Files (x86)\\Tesseract-OCR\\tesseract.exe"
  316. # img = Image.open(found)
  317. # text = tess.image_to_string(img)
  318. # image_to_text.txt = text
  319. # print(text)
  320. import cv2
  321. img_path = found
  322. img = cv2.imread(img_path)
  323. img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
  324. cv2.imwrite(str(found), img)
  325. result = ocr.ocr(img_path, cls=True)
  326. result = result[0]
  327. txts = [line[1][0] for line in result]
  328. image_to_text.txt = ""
  329. for i in txts:
  330. if len(i) < 4:
  331. continue
  332. # print(i+"\n")
  333. image_to_text.txt = image_to_text.txt + str(i) + "\n"
  334. # print(image_to_text.txt)
  335. def pdf_to_text():
  336. from pdfminer.high_level import extract_text
  337. pdf_to_text.txt = extract_text(found)
  338. # pdf_to_text.txt= text.replace('\n', ' ')
  339. extensionlist = ['JPEG', 'jpg', 'png', 'JPG', 'PNG', 'jpeg']
  340. if extension in extensionlist:
  341. print('image' + extension)
  342. image_to_text()
  343. x = image_to_text.txt
  344. else:
  345. print('pdf' + extension)
  346. pdf_to_text()
  347. x = pdf_to_text.txt
  348. verticaltext = x
  349. htext = x
  350. # print('------------------------------------------------')
  351. print(
  352. '############################################################# this is verticaltext #################################################################')
  353. print(verticaltext)
  354. htext = htext.replace('\n', ' ')
  355. print(
  356. '############################################################# this is htext #############################################################')
  357. print(htext)
  358. y = x.replace('\n', ',')
  359. y = y.replace(' ', ' ')
  360. # y = y.replace(".", " .")
  361. horizontaltext = y
  362. # print('------------------------------------------------')
  363. print(
  364. '############################################################# this is horizontaltext #############################################################')
  365. print(horizontaltext)
  366. textfile = open("test123456.txt", "w")
  367. a = textfile.write(verticaltext)
  368. textfile.close()
  369. textfile = open("vtext.txt", "w")
  370. a = textfile.write(horizontaltext)
  371. textfile.close()
  372. with open('test123456.txt', 'r') as f:
  373. with open('test.txt', 'w') as w:
  374. for line in f:
  375. if line.strip().replace('|', ''):
  376. w.write(line)
  377. ###########################ADDRESS##################################
  378. addrespinlst = []
def splitaddress():
    """Fallback address extractor: rebuild an address from the OCR text
    (`htext` closure variable) as "<word before first comma>,<text up to a
    PIN-code-like number>,<PIN code>" and append it to `final` and
    `addrespinlst`.

    NOTE(review): `address2`/`address3` are only bound when the regexes
    match; the NameError handler below deliberately catches the unbound
    case and appends nothing — confirm that silently skipping the address
    is the intended behavior.
    """
    import re
    textaddress = htext.replace('\n', ' ')
    # Take the last word before the first comma as the address prefix.
    address1 = (textaddress.partition(",")[0])
    words = address1.split()
    address1 = words[-1]
    addre = (htext.partition(",")[2])
    a = addre.replace('\n', ' ').replace('\x0c', '')
    addre = (a.partition(",")[2])
    # Capture text leading up to something shaped like an Indian PIN code
    # (6 digits, "ddd ddd", "-dddddd", ...). Only the LAST match survives
    # the loop — presumably intentional; verify.
    matches = re.findall(
        r'(.*?)-\d{3} \d{3}|(.*?)\b-\d{6}\b|(.*?)\b\d{6}\b|(.*?)\b\d{3} \d{3}\b|\b(.*?)-\d{2}\b|(.*?)\b\d{3} \d{3}\b',
        a)
    for match in matches:
        address2 = match
        address2 = str(address2)
        # str() of the group tuple leaves quotes/parens/empty slots; strip
        # them (and all spaces) to get a compact middle segment.
        address2 = address2.replace("'", "").replace("(", "").replace(")", "").replace(', ,', '').replace(' ', '')
    # Second pass: pull just the PIN-code-like token itself (last one wins).
    matches = re.findall(r'-\d{6}|\b\d{6}\b|\b\d{3} \d{3}\b|\b-\d{2}\b|-\d{3} \d{3}|\b\d{3} \d{3}\b', a)
    for address3 in matches:
        pass
    try:
        Address = address1 + "," + address2 + "," + address3
        final.append('ADDRESS--' + Address)
        addrespinlst.append(Address)
    except NameError:
        # No PIN-code pattern found, so address2/address3 never got bound.
        print(
            '############################################################ Addressmodelworking #############################################################')
        pass
  423. ################################################## website#######################################################
  424. # import re
  425. # url = []
  426. # matches = re.findall(r'www.*', verticaltext)
  427. # for match in matches:
  428. # if (match.count('.')) == 1:
  429. # a_string1 = match.replace("www", "www.")
  430. # final.append("Urls--" + a_string1)
  431. # url.append(a_string1)
  432. # else:
  433. # final.append("Urls--" + match)
  434. # if len(url)==0:
  435. # from urlextract import URLExtract
  436. # extractor = URLExtract()
  437. # urls = extractor.find_urls(verticaltext)
  438. # try:
  439. # urllist = urls[0]
  440. # final.append("Urls--"+urllist)
  441. # url.append(urllist)
  442. # except IndexError:
  443. # final.append("Urls--")
  444. # for match in matches:
  445. # if (match.count('.')) == 1:
  446. # a_string1 = match.replace("www", "www.")
  447. # final.append("Urls--" + a_string1)
  448. # url.append(a_string1)
  449. # else:
  450. # final.append("Urls--" + match)
  451. # url.append(match)
  452. # remove_list.append(match)
  453. # else:
  454. # final.append("Urls--" )
  455. ################################################## website#######################################################
  456. import re
  457. # final=[]
  458. url = []
  459. urlfinal = []
  460. matches = re.findall(r'www.*', verticaltext)
  461. for match in matches:
  462. if (match.count('.')) == 1:
  463. a_string1 = match.replace("www", "www.")
  464. # final.append("Urls--" + a_string1)
  465. url.append(a_string1)
  466. else:
  467. url.append(match)
  468. if len(url) == 0:
  469. from urlextract import URLExtract
  470. extractor = URLExtract()
  471. urls = extractor.find_urls(verticaltext)
  472. try:
  473. urllist = urls[0]
  474. url.append(urllist)
  475. url.append(urllist)
  476. except IndexError:
  477. pass
  478. for match in matches:
  479. if (match.count('.')) == 1:
  480. a_string1 = match.replace("www", "www.")
  481. url.append(a_string1)
  482. # url.append(a_string1)
  483. else:
  484. url.append(match)
  485. url.append(match)
  486. else:
  487. pass
  488. try:
  489. test_string = url[0]
  490. test_list = ['com', 'www', 'in', 'co', "WWW", "COM", "CO", "IN"]
  491. res = [ele for ele in test_list if (ele in test_string)]
  492. if len(res) == 0:
  493. print('no match')
  494. final.append('urls--')
  495. else:
  496. print('matched')
  497. final.append('urls--' + url[0])
  498. urlfinal.append(url[0])
  499. except IndexError:
  500. final.append('urls--')
  501. print(
  502. '############################################################# url #############################################################')
  503. print(url)
  504. #######organisation and contact################
  505. # def company_url():
  506. # # print('--url--')
  507. # # print(url)
  508. # try:
  509. # match = str(url[0]).lower()
  510. # match =match.replace('.com','').replace('www.','').replace('.in','').replace('.co','').upper()
  511. # final.append("OrganizationName--" + match)
  512. # # remove_list.append(match)
  513. # except IndexError:
  514. # org_name()
  515. # organisation()
  516. # final.append("OrganizationName--")
  517. # make example sentence
  518. # print(horizontaltext)
  519. sentence = Sentence(verticaltext)
  520. # predict NER tags
  521. tagger.predict(sentence)
  522. # print sentence
  523. ko = (sentence)
  524. ko1 = str(ko).split("→")
  525. import pandas as pd
  526. dfg = []
  527. try:
  528. s = ko1[1].replace("", "").replace("", "").replace("/", ":")
  529. except IndexError:
  530. os.remove(found)
  531. return 'Invalid image'
  532. dfg.append(s)
  533. df = pd.DataFrame(dfg)
  534. df = df[0]
  535. df.to_csv("df.csv", index=False)
  536. df1 = pd.read_csv("df.csv")
  537. ve = df1["0"].str.split(",")
  538. fgf = ve.to_list()
  539. dfgh = pd.DataFrame(fgf[0])
  540. maindf = dfgh[0] # .str.split(":")
  541. # maindf.to_csv("main.csv")
  542. main1 = maindf.to_list()
  543. main1
  544. # cv=pd.DataFrame(ve)
  545. # cv
  546. per = ["PER"]
  547. org = ["ORG"]
  548. loc = ["LOC"]
  549. organizations = [i for i in main1 for j in org if j in i]
  550. PErsons = [i for i in main1 for j in per if j in i]
  551. location = [i for i in main1 for j in loc if j in i]
  552. # ************************************* ORGANIZATION ********************************************************************
  553. try:
  554. if len(("OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace('[',
  555. '').replace(
  556. ']', '').replace(
  557. '.com', '') + " /" + organizations[1].replace(":ORG", "").replace('"', '').replace('.com', ''))) < 4:
  558. pass
  559. # company_url()
  560. else:
  561. match = str(urlfinal[0]).lower()
  562. match = match.replace('.com', '').replace('www.', '').replace('.in', '').replace('.co', '').replace(
  563. 'https',
  564. '').replace(
  565. 'http', '').replace(":", "").replace("/", "").upper()
  566. print(match)
  567. s1g = organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace(']', '').replace(
  568. '.com', '') + " /" + \
  569. organizations[1].replace(":ORG", "").replace('"', '').replace('.com', '')
  570. s1 = s1g.upper()
  571. s2 = match.upper()
  572. from difflib import SequenceMatcher
  573. print(s1)
  574. print(s2)
  575. print(SequenceMatcher(None, s1, s2).ratio())
  576. if SequenceMatcher(None, s1, s2).ratio() >= 0.10:
  577. # and SequenceMatcher(None, s1, s2).ratio()<0.50:
  578. final.append(
  579. "OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace('[',
  580. '').replace(
  581. '.com', '').replace(']', '') + " /" + organizations[1].replace(":ORG", "").replace('"',
  582. '').replace(
  583. '.com', '').replace(']', ''))
  584. else:
  585. final.append("OrganizationName--" + s2)
  586. except IndexError:
  587. try:
  588. if len(("OrganizationName--" + organizations[0].replace(":ORG", "").replace('[', '').replace(']',
  589. '').replace(
  590. '"',
  591. '').replace(
  592. '.com', ''))) < 4:
  593. pass
  594. # company_url()
  595. else:
  596. match = str(urlfinal[0]).lower()
  597. match = match.replace('.com', '').replace('www.', '').replace('.in', '').replace('.co', '').replace(
  598. 'https', '').replace('http', '').replace(":", "").replace("/", "").upper()
  599. s1g = organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace(']',
  600. '').replace(
  601. '.com', '')
  602. s1 = s1g.upper()
  603. s2 = match.upper()
  604. from difflib import SequenceMatcher
  605. print(s1)
  606. print(s2)
  607. print(SequenceMatcher(None, s1, s2).ratio())
  608. if SequenceMatcher(None, s1, s2).ratio() >= 0.10:
  609. # and SequenceMatcher(None, s1, s2).ratio()<0.50:
  610. final.append(
  611. "OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace('[',
  612. '').replace(
  613. ']', '').replace(
  614. '.com', '').replace(']', ''))
  615. else:
  616. final.append("OrganizationName--" + s2)
  617. except IndexError:
  618. org_name()
  619. organisation()
  620. # final.append("OrganizationName--")
  621. # ************************************* CONTACT PERSON *******************************************************************
  622. try:
  623. final.append(
  624. "CONTACTPERSONNAME--" + PErsons[0].replace(":PER", "").replace("[", "").replace('"', '').replace("]",
  625. "") +
  626. PErsons[
  627. 1].replace(":PER", "").replace('"', ''))
  628. except IndexError:
  629. try:
  630. final.append(
  631. "CONTACTPERSONNAME--" + PErsons[0].replace(":PER", "").replace("[", "").replace("]", "").replace(
  632. '"',
  633. ''))
  634. except IndexError:
  635. org_name()
  636. contactpersonname()
  637. # final.append("CONTACTPERSONNAME--")
  638. ###############address flair#####################
  639. try:
  640. print(
  641. '############################################################# address new code #############################################################')
  642. loactionlst = ['address', 'factory', 'd.no', 'h.no', 'h. no', 'plot', 'flat', 'plat']
  643. loclst = [i for i in loactionlst if i in htext.lower()]
  644. textaddress = htext
  645. textaddress = textaddress.replace("|", ",")
  646. textaddress = textaddress.lower()
  647. nlp = pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy="simple")
  648. grop = nlp(textaddress)
  649. citycountry = []
  650. print('########################### city or country name ###########################')
  651. d = grop[-1]
  652. if d['entity_group'] == "COUNTRY":
  653. print(d["word"])
  654. citycountry.append(d["word"])
  655. elif d['entity_group'] == "CITY":
  656. print(d["word"])
  657. citycountry.append(d["word"])
  658. try:
  659. address1 = loclst[0]
  660. except IndexError:
  661. address1 = (textaddress.partition(",")[0])
  662. words = address1.split()
  663. address1 = words[-1]
  664. star_location = address1.lower()
  665. end_location = citycountry[0].replace("#", "")
  666. start = star_location
  667. end = end_location
  668. s = textaddress.lower()
  669. middle_address = (s.split(start))[-1].split(end)[0]
  670. Address = start + middle_address + end
  671. Address = Address.replace('--', '').title()
  672. print(Address)
  673. if Address.count(',') < 2:
  674. splitaddress()
  675. else:
  676. final.append('ADDRESS--' + Address)
  677. # star_location = location[0].replace(":LOC", "").replace('"', '').replace('[', '')
  678. # end_location = location[-1].replace(":LOC", "").replace('"', '').replace(']', '')
  679. # d1 = star_location.split()
  680. # d2 = end_location.split()
  681. # d3 = d1[0]
  682. # d4 = d2[0]
  683. # start = d3
  684. # end = d4
  685. # s = horizontaltext
  686. # middle_address = ((s.split(start))[1].split(end)[0])
  687. # Address = d3 + middle_address + d4
  688. # final.append('ADDRESS--' + Address)
  689. # addrespinlst.append(Address)
  690. except IndexError:
  691. splitaddress()
  692. ########################################## Designation ###########################################
  693. import re
  694. new = []
  695. with open('test.txt', 'r') as f:
  696. flag = False
  697. for line in f:
  698. line1 = line
  699. line = line.upper()
  700. matches = re.findall(
  701. r'''\bAPPRENTICE\b|\bEXECUTIVE\b|\bPROPRIETOR\b|\bPARTNER\b|\bMD\b|\bANALYST\b|\bPRACTITIONER\b|\bCUSTOMER\b|\bCOO\b|\bCOACH\b|\bADMINISTRATIVE\b|\bADMINISTRATOR\b|\bAGENT\b|\bHEAD\b|\bCHIEF\b|\bDIRECTOR\b|\bVICE\b|\bPRESIDENT\b|\bMANAGER\b|\bCOORDINATOR\b|\bCOUNSELOR\b|\bSUPERVISOR\b|\bASSISTANT\b|\bSPECIALIST\b|\bARTIST\b|\bWORKER\b|\bCONSULTANT\b|\bREPRESENTATIVE\b|\bARCHITECT\b|\bSTAFF\b|\bMEMBER\b|\bDEVELOPER\b|\bENGINEER\b|\bEXAMINOR\b|\bDOCTOR\b|\bPROFESSOR\b|\bTEACHER\b|\bLEAD\b|\bOFFICER\b|\bCEO\b|\bC.E.O\b|\bJUNIOR\b|\bSENIOR\b|\bPROFESSOR\b|\bSALES\b''',
  702. line)
  703. for match in matches:
  704. line = line.replace('-', '')
  705. # print(line)
  706. o = "Designation--" + line
  707. new.append(o)
  708. remove_list.append(str(line1).replace('\n', ''))
  709. try:
  710. a = new[0].replace('\n', '')
  711. final.append(a)
  712. except IndexError:
  713. final.append("Designation--")
  714. ###################################################Phone number#################################################
  715. num = []
  716. import phonenumbers
  717. # print(verticaltext)
  718. numbers = phonenumbers.PhoneNumberMatcher(
  719. verticaltext.replace('+91', '').replace('(0)', '').replace('(', '').replace(')', ''), "IN")
  720. for number in numbers:
  721. number = str(number).split(")")
  722. num.append(number[1])
  723. # num.append(number[-1])
  724. if len(num) == 0:
  725. final.append("ContactNumber--")
  726. final.append("OrganizationNumber--")
  727. elif len(num) > 1:
  728. final.append("ContactNumber--" + num[0].replace(' ', ''))
  729. final.append("OrganizationNumber--" + num[-1].replace(' ', ''))
  730. elif len(num) == 1:
  731. try:
  732. final.append("ContactNumber--" + num[0].replace(' ', ''))
  733. final.append("OrganizationNumber--")
  734. except IndexError:
  735. final.append("ContactNumber--")
  736. final.append("OrganizationNumber--")
  737. print(
  738. '############################################################# num #############################################################')
  739. print(num)
  740. # try:
  741. # final.append("PhoneNumber--" + num[0].replace(' ', ''))
  742. # remove_list.append(num[0])
  743. # except IndexError:
  744. # pass
  745. # try:
  746. # final.append("PhoneNumber1--" + num[1].replace(' ', ''))
  747. # remove_list.append(num[1])
  748. # except IndexError:
  749. # pass
  750. # try:
  751. # final.append("PhoneNumber2--" + num[2].replace(' ', ''))
  752. # remove_list.append(num[2])
  753. # except IndexError:
  754. # pass
  755. ################################################### Email######################################################
  756. import re
  757. from email_scraper import scrape_emails
  758. s = list(scrape_emails(horizontaltext))
  759. email_id = s
  760. # email_id = []
  761. # matches = re.findall(r'[\w\.-]+@[\w\.-]+', verticaltext)
  762. # for match in matches:
  763. # email_id.append(match)
  764. # # final.append('Email--' + match)
  765. # email_ = str(email_id).replace("[", "").replace("]", "").replace("'", "")
  766. # # final.append(email_)
  767. # # final.append('Email--' + email_)
  768. # # remove_list.append(email_)
  769. if len(email_id) > 1:
  770. final.append(
  771. 'ContactEmail--' + str(email_id[0]).replace("[", "").replace("]", "").replace("\\n", "").replace("'",
  772. ""))
  773. final.append(
  774. 'OrganizationEmail--' + str(email_id[-1]).replace("[", "").replace("]", "").replace("\\n", "").replace(
  775. "'",
  776. ""))
  777. else:
  778. try:
  779. final.append(
  780. 'ContactEmail--' + str(email_id[0]).replace("[", "").replace("]", "").replace("\\n", "").replace(
  781. "'",
  782. ""))
  783. final.append('OrganizationEmail--')
  784. except IndexError:
  785. final.append('ContactEmail--')
  786. final.append('OrganizationEmail--')
  787. ###############PINCODE############
  788. pinlst = []
  789. print(addrespinlst)
  790. import pgeocode
  791. # try:
  792. # matche1 = re.findall(r'-\d{6}|\b\d{6}\b|\b\d{3} \d{3}\b|-\d{3} \d{3}|\b\d{3} \d{3}\b', addrespinlst[0])
  793. # for i in matche1:
  794. # address3 = i.replace(' ', '').replace('-', '')
  795. # pinlst.append(address3)
  796. # except IndexError:
  797. lst = []
  798. for i in num:
  799. i = i[1:]
  800. lst.append(i)
  801. infile = r"vtext.txt"
  802. outfile = r"cleaned_file.txt"
  803. import glob
  804. delete_list = lst
  805. # delete_list = ["firstname1 lastname1","firstname2 lastname2","firstnamen lastnamen",'Director - Sales & Business Development']
  806. fin = open(infile, "r+")
  807. fout = open(outfile, "w+")
  808. for line12 in fin:
  809. for word in delete_list:
  810. line12 = line12.replace(word, "")
  811. fout.write(line12)
  812. fin.close()
  813. # print(line)
  814. # print(addrespinlst)
  815. import pgeocode
  816. print(line12)
  817. import re
  818. matche1 = re.findall(r'-\d{6}|\b\d{6}\b|\b\d{3} \d{3}\b|-\d{3} \d{3}|\b\d{3} \d{3}\b', line12)
  819. for i in matche1:
  820. address3 = i.replace(' ', '').replace('-', '')
  821. pinlst.append(address3)
  822. nomi = pgeocode.Nominatim('IN')
  823. try:
  824. a = nomi.query_postal_code(str(pinlst[-1]))
  825. # print(a)
  826. b = a.keys()
  827. c = b.values.tolist()
  828. d = a.tolist()
  829. postal_code = "PinCode1" + "--" + d[0]
  830. final.append(postal_code)
  831. country_code = c[1] + "--" + str(d[1])
  832. final.append(country_code)
  833. place_name = 'LandMark1' + "--" + str(d[2])
  834. final.append(place_name)
  835. state_name = c[3] + "--" + str(d[3])
  836. final.append(state_name)
  837. state_code = c[4] + "--" + str(d[4])
  838. final.append(state_code)
  839. county_name = 'CityName1' + "--" + str(d[5])
  840. final.append(county_name)
  841. except (IndexError, NameError):
  842. final.append("PinCode1--")
  843. final.append("country_code--")
  844. final.append("LandMark1--")
  845. final.append("state_name--")
  846. final.append("state_code--")
  847. final.append("CityName1--")
  848. ######################################################## json #####################################################################
  849. import pandas as pd
  850. df = pd.DataFrame(final)
  851. df1 = df[0].str.split('--', expand=True)
  852. # print(df1)
  853. df1.rename({df1.columns[-2]: 'Keys'}, axis=1, inplace=True)
  854. df1.rename({df1.columns[-1]: 'Values'}, axis=1, inplace=True)
  855. df1['Keys'] = df1['Keys'].str.strip()
  856. df1.to_csv('path123.csv', index=False)
  857. df2 = pd.read_csv('path123.csv')
  858. print(df2)
  859. df2 = df2.T
  860. df2.to_csv('path1.csv', index=False, header=False)
  861. df1 = pd.read_csv('path1.csv')
  862. df1.to_json('firstjson1.json', orient="index")
  863. import json
  864. with open('firstjson1.json', 'r') as json_file:
  865. json_load = json.load(json_file)
  866. # # url = "https://test.bizgaze.app:8443/apis/v4/bizgaze/integrations/businesscards/create"
  867. nothing = json.dumps(json_load).replace("]", "").replace("[", "").replace('{"0":', '').replace('}}', '}')
  868. # # print('--------------------------------------------------------------------------')
  869. # # print(nothing)
  870. empty = []
  871. import base64
  872. name = found
  873. image = open(name, 'rb')
  874. image_read = image.read()
  875. image_64_encode = base64.b64encode(image_read)
  876. NULL = 'null'
  877. empty.append("ByteData--" + (NULL).strip('""'))
  878. image_64_encode = image_64_encode.decode('utf-8')
  879. empty.append("FileData--" + str(image_64_encode))
  880. imagedata = name.split("/")
  881. imagename = str(imagedata[-1]).replace('"', '').replace("[", "").replace("]", "")
  882. imagename1 = str(imagename).split('.')
  883. imagename = str(imagename1[-2]).replace("[", "]")
  884. empty.append("FileName--" + imagename)
  885. empty.append("FilePath--"+ "")
  886. imageExtension = str(imagename1[-1]).replace("[", "]")
  887. empty.append("FileType--" + imageExtension)
  888. image.close()
  889. import pandas as pd
  890. df = pd.DataFrame(empty)
  891. df = df[0].str.split("--", expand=True)
  892. data1 = pd.DataFrame(df[0])
  893. data2 = pd.DataFrame(df[1])
  894. dt = data2.set_index(data1[0])
  895. dt4 = dt.T
  896. dictionary = dt4.to_dict(orient="index")
  897. list1 = []
  898. # list.append(a)
  899. list1.append(dictionary[1])
  900. # # final.append("image--"+str(dictionary[1]).replace("\'",'"'))
  901. print('--------------------')
  902. # print(namelist)
  903. import json
  904. # JSON data:
  905. x = nothing
  906. # python object to be appended
  907. y = {"image": dictionary[1]}
  908. # parsing JSON string:
  909. z = json.loads(x)
  910. # appending the data
  911. z.update(y)
  912. # the result is a JSON string:
  913. # print(json.dumps(z))
  914. zlist=[]
  915. zlist.append(z)
############################################# creating csv #####################################
print(final)
print(imagelist)
final.append('image--' + str(imagelist))
import requests
import json
# Target endpoint; the commented alternates are other environments.
url = "https://anwi.bizgaze.app/apis/v4/bizgaze/integrations/businesscards/create/list" #dev
# url = "https://qa.bizgaze.app/apis/v4/bizgaze/integrations/businesscards/create" #testing
# url = "https://test.bizgaze.app/apis/v4/bizgaze/integrations/businesscards/create" # test
# url='http://localhost:3088/apis/v4/bizgaze/integrations/businesscards/create'
# url = 'https://c01.bizgaze.app/apis/v4/bizgaze/integrations/businesscards/create' # C01
payload1 = json.dumps(zlist)
# print('--------------------------------------------------------------------------')
#print(payload1)
# SECURITY NOTE(review): the Authorization token is hard-coded in source;
# it should be moved to configuration or an environment variable.
headers = {
    #'Authorization': 'stat 1a936137490040c997928f485e3cdd7a', #dev
    'Authorization': 'stat 16516391d0074f4c8a15ea16fb49470b',#testing
    # 'Authorization': 'stat 08e55fcfbaa940c8ab8145a074c444d1',
    # 'Authorization': 'stat f7cdb402e01e44e5842878653946168f', # c01
    # 'Authorization': 'Stat c3e11b2fcbfe455b86a1fe6efde02a69',#demo
    'Content-Type': 'application/json'
}
response = requests.request("POST", url, headers=headers, data=payload1)
# print("##############################################################")
#print(payload1)
print(response.text)
import os
# Delete the processed image only when the API confirms creation.
if 'BusinessCards Created Successfully' in response.text:
    print('present')
    os.remove(found)
else:
    print('not present')
# NOTE(review): indentation was lost in extraction — it is unclear whether
# the three lines below belong inside the else-branch above; confirm.
df1.to_json('visitingcard.json')
data = df1.to_json('visiting.json', orient='records')
print(data)
#return render_template('index.html')
return response.text
# return 'done'
  954. if __name__ == "__main__":
  955. app.run(host='0.0.0.0', port=1112)