No Description
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

Business_cards.py 43KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117
  1. from flask import Flask, render_template, request, redirect, Response, send_file
  2. import os
  3. import openai
  4. import requests
  5. import pandas as pd
  6. import pgeocode
  7. from email_scraper import scrape_emails
  8. import phonenumbers
  9. from pdfminer.high_level import extract_text
  10. import pytesseract
  11. import time
  12. import multiprocessing
  13. from PIL import Image
  14. from functools import partial
  15. from urlextract import URLExtract
  16. import pytesseract as tess
  17. from PIL import Image
  18. import os
  19. import glob
  20. from pytesseract import *
  21. import shutil
  22. import cv2
  23. import matplotlib
  24. from werkzeug.utils import secure_filename
  25. import requests
  26. import spacy
  27. import time
  28. import multiprocessing
  29. from PIL import Image
  30. from functools import partial
  31. import pandas as pd
  32. ################################################################
  33. Current_Working_Directory=os.getcwd()
  34. Current_Working_Directory=Current_Working_Directory.replace("\\","/")
  35. nlp_model1 = spacy.load(Current_Working_Directory + "/Invoice_parser/p")
  36. ################################################################
  37. # import spacy
  38. # nlp_model1 = spacy.load('./ADD3001.2')
  39. from flair.data import Sentence
  40. from flair.models import SequenceTagger
  41. from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
  42. tokenizer = AutoTokenizer.from_pretrained("ml6team/bert-base-uncased-city-country-ner")
  43. model = AutoModelForTokenClassification.from_pretrained("ml6team/bert-base-uncased-city-country-ner")
  44. from paddleocr import PaddleOCR, draw_ocr
  45. ocr = PaddleOCR(use_angle_cls=True, lang='en', use_space_char=True, show_log=True)
  46. tagger = SequenceTagger.load("flair/ner-english-large")
  47. import datetime
  48. app = Flask(__name__)
  49. # app.config["IMAGE_UPLOADS"] = "C:/inetpub/wwwroot/FlaskApplication/Flask_Demo/upload/"
  50. @app.route('/', methods=['GET'])
  51. def card():
  52. return render_template('card.html')
  53. @app.route('/upload_BusinessCards', methods=["POST"])
  54. # @app.route('/multiplecards', methods=["POST"])
  55. def multiplecards():
  56. # print('################## multiple card detection #######################')
  57. # print(Dataset)
  58. datalist=[]
  59. zlist=[]
  60. Dataset = request.get_json()
  61. # print(data)
  62. #datalist.append(Dataset)
  63. data = {'visiting': Dataset}
  64. for i in data['visiting']:
  65. import time
  66. # time.sleep(1)
  67. a = i
  68. x = a['FileData']
  69. # print(x)
  70. y = a['FileName']
  71. z = a['FileType']
  72. # CreatedBy=a['CreatedBy']
  73. name = y + '.' + z
  74. # print(name)
  75. # print(y)
  76. # image = y.split("/")
  77. # filename=image[-1]
  78. # print(x)
  79. img_data = x.encode()
  80. import base64
  81. with open('./multicards/' + name, "wb") as fh:
  82. fh.write(base64.decodebytes(img_data))
  83. # print(i)
  84. # import os
  85. # import glob
  86. # for i in glob.glob('./multipleupload/*'):
  87. found = './multicards/' + name
  88. print(found)
  89. extension = found.split('.')[-1]
  90. # for root, dirs, fils in os.glob('./multipleupload'):
  91. # for name in files:
  92. # foundfile= os.path.join(root, name)
  93. # print(foundfile)
  94. import re
  95. import csv
  96. import glob
  97. import os
  98. # import pytesseract
  99. # import cv2
  100. import numpy as np
  101. import glob
  102. import os
  103. import cv2
  104. import requests
  105. final = []
  106. # final.append('assignto--'+CreatedBy)
  107. imagelist = []
  108. # print(found)
  109. remove_list = []
  110. import os
  111. import glob
  112. import pdfminer
  113. # import os
  114. # ts = 0
  115. # for file_name in glob.glob('./upload/*'):
  116. # fts = os.path.getmtime(file_name)
  117. # if fts > ts:
  118. # ts = fts
  119. # found = file_name
  120. # print(found)
  121. # print(extension)
  122. def org_name():
  123. print('org_name is working')
  124. import pytesseract
  125. fname = found
  126. if extension != 'pdf':
  127. img = cv2.imread(fname)
  128. img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
  129. cv2.imwrite(str(found), img)
  130. from PIL import Image
  131. im = Image.open(found)
  132. im.save("images1.png", dpi=(1200, 1200))
  133. # import pytesseract
  134. fname = "images1.png"
  135. import pytesseract as tess
  136. from PIL import Image
  137. tess.pytesseract.tesseract_cmd = r"C:\\Program Files (x86)\\Tesseract-OCR\\tesseract.exe"
  138. pdf = tess.image_to_pdf_or_hocr(fname, extension="pdf")
  139. with open("demo.pdf", "w+b", ) as f:
  140. f.write(pdf)
  141. from pdfminer.high_level import extract_text
  142. text = extract_text('demo.pdf')
  143. # doc = DocumentFile.from_images(found)
  144. # result = model(doc)
  145. # text = result.render()
  146. # from pdfminer.high_level import extract_text
  147. # txt = extract_text('demo.pdf')
  148. else:
  149. from pdfminer.high_level import extract_text
  150. text = extract_text(fname)
  151. sentence = Sentence(text)
  152. # predict NER tags
  153. tagger.predict(sentence)
  154. # print sentence
  155. ko = (sentence)
  156. ko1 = str(ko).split("→")
  157. import pandas as pd
  158. dfg = []
  159. try:
  160. s = ko1[1].replace("", "").replace("", "").replace("/", ":")
  161. # os.remove(found)
  162. # return 'Invalid image'
  163. dfg.append(s)
  164. df = pd.DataFrame(dfg)
  165. df = df[0]
  166. df.to_csv("df.csv", index=False)
  167. df1 = pd.read_csv("df.csv")
  168. ve = df1["0"].str.split(",")
  169. fgf = ve.to_list()
  170. dfgh = pd.DataFrame(fgf[0])
  171. maindf = dfgh[0] # .str.split(":")
  172. # maindf.to_csv("main.csv")
  173. main1 = maindf.to_list()
  174. main1
  175. # cv=pd.DataFrame(ve)
  176. # cv
  177. per = ["PER"]
  178. org = ["ORG"]
  179. loc = ["LOC"]
  180. organizations = [i for i in main1 for j in org if j in i]
  181. PErsons = [i for i in main1 for j in per if j in i]
  182. location = [i for i in main1 for j in loc if j in i]
  183. except IndexError:
  184. pass
  185. # ************************************* ORGANIZATION ********************************************************************
  186. def organisation():
  187. print('organisation working ')
  188. try:
  189. if len(("OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace('[',
  190. '').replace(
  191. '.com', '').replace('.in', '') + " /" + organizations[1].replace(":ORG", "").replace('"',
  192. '').replace(
  193. '.com', ''))) < 4:
  194. pass
  195. else:
  196. match = str(urlfinal[0]).lower()
  197. match = match.replace('.com', '').replace('www.', '').replace('.in', '').replace('.co', '').replace(
  198. 'https',
  199. '').replace(
  200. 'http', '').replace(":", "").replace("/", "").upper()
  201. print(match)
  202. s1g = organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace('.com',
  203. '') + " /" + \
  204. organizations[1].replace(":ORG", "").replace('"', '').replace('.com', '')
  205. s1 = s1g.upper()
  206. s2 = match.upper()
  207. from difflib import SequenceMatcher
  208. print(s1)
  209. print(s2)
  210. print(SequenceMatcher(None, s1, s2).ratio())
  211. if SequenceMatcher(None, s1, s2).ratio() >= 0.10:
  212. # and SequenceMatcher(None, s1, s2).ratio()<0.50:
  213. final.append(
  214. "OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace('[',
  215. '').replace(
  216. '.com', '').replace(']', '') + " /" + organizations[1].replace(":ORG", "").replace('"',
  217. '').replace(
  218. '.com',
  219. '').replace(']', ''))
  220. else:
  221. final.append("OrganizationName--" + s2)
  222. except IndexError:
  223. try:
  224. if len(("OrganizationName--" + organizations[0].replace(":ORG", "").replace('[', '').replace(']',
  225. '').replace(
  226. '"',
  227. '').replace(
  228. '.com', '').replace('.in', ''))) < 4:
  229. pass
  230. else:
  231. match = str(urlfinal[0]).lower()
  232. match = match.replace('.com', '').replace('www.', '').replace('.in', '').replace('.co',
  233. '').replace(
  234. 'https', '').replace('http', '').replace(":", "").replace("/", "").upper()
  235. s1g = organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace('.com', '')
  236. s1 = s1g.upper()
  237. s2 = match.upper()
  238. from difflib import SequenceMatcher
  239. print(s1)
  240. print(s2)
  241. print(SequenceMatcher(None, s1, s2).ratio())
  242. if SequenceMatcher(None, s1, s2).ratio() >= 0.10:
  243. # and SequenceMatcher(None, s1, s2).ratio()<0.50:
  244. final.append(
  245. "OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace(
  246. '[',
  247. '').replace(
  248. ']', '').replace(
  249. '.com', ''))
  250. else:
  251. final.append("OrganizationName--" + s2)
  252. except IndexError:
  253. try:
  254. match = str(urlfinal[0]).lower()
  255. match = match.replace('.com', '').replace('www.', '').replace('.in', '').replace('.co',
  256. '').upper()
  257. final.append("OrganizationName--" + match)
  258. # remove_list.append(match)
  259. except IndexError:
  260. company()
  261. #################################################company Name########################################
  262. def company():
  263. print('company list working')
  264. import re
  265. new = []
  266. with open('test.txt', 'r+') as f:
  267. flag = False
  268. for line in f:
  269. line = line.upper()
  270. matches = re.findall(
  271. r'''\bENTERPRISE\b|\bTRADE\b|\bEMPIRE\b|\bSTORES\b|\bMACHINERY\b|\bINDUSTRIES\b|\bTECHNOLOGY\b|\bCOMPANY\b|\bDESIGNERS\b|\bPOLYMER\b|\bBELT\b|\bAGRO\b|\bPLASTIC\b|\bGROUP\b|\bTOOLS\b|\bENGG.\b|\bSOLUTION\b|\bCONSTRUCTION\b|\bPACK\b|\bELECT\b|\bSTEEL\b|\bIRON\b|\bDIES\b|\bMOULD\b|\bCORPORATION\b|\bSEEDS\b|\bPOWER\b|\bCONSULTANT\b|\bMFG.\b|\bPRINT\b|\bFOOD\b|\bSOLAR\b|\bINDUSTRY\b|\bLIMITED\b|\bPRIVATE\b|\bPVT\b|\bLTD\b|\bOUTSOURCING\b|\bCNC\b|\bMACHINERIES\b|\bSOLUTIONS\b|\bENGINEERS\b|\bWORKS\b|\bPRODUCTS\b|\bENTERPRISES\b|\bCOMPANIES\b|\bPOLYMERS\b|\bTRADING\b''',
  272. line)
  273. for i in matches:
  274. if i in line:
  275. flag = True
  276. if flag:
  277. o = "OrganizationName--" + line
  278. new.append(o)
  279. # if line.startswith('\n'):
  280. # flag = False
  281. try:
  282. a = new[0].replace('\n', '')
  283. final.append(a)
  284. except IndexError:
  285. final.append("OrganizationName--")
  286. # ************************************* CONTACT PERSON *******************************************************************
  287. def contactpersonname():
  288. print('contactpersonname working')
  289. try:
  290. final.append(
  291. "CONTACTPERSONNAME--" + PErsons[0].replace(":PER", "").replace("[", "").replace('"', '').replace(
  292. "]",
  293. "") + '/' +
  294. PErsons[
  295. 1].replace(":PER", "").replace('"', ''))
  296. except IndexError:
  297. try:
  298. final.append(
  299. "CONTACTPERSONNAME--" + PErsons[0].replace(":PER", "").replace("[", "").replace("]",
  300. "").replace(
  301. '"', ''))
  302. except IndexError:
  303. final.append("CONTACTPERSONNAME--")
  304. def image_to_text():
  305. # doc = DocumentFile.from_images(found)
  306. # result = model(doc)
  307. # image_to_text.txt = result.render()
  308. # tess.pytesseract.tesseract_cmd = r"C:\\Program Files (x86)\\Tesseract-OCR\\tesseract.exe"
  309. # img = Image.open(found)
  310. # text = tess.image_to_string(img)
  311. # image_to_text.txt = text
  312. # print(text)
  313. import cv2
  314. img_path = found
  315. img = cv2.imread(img_path)
  316. img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
  317. cv2.imwrite(str(found), img)
  318. result = ocr.ocr(img_path, cls=True)
  319. result = result[0]
  320. txts = [line[1][0] for line in result]
  321. image_to_text.txt = ""
  322. for i in txts:
  323. if len(i) < 4:
  324. continue
  325. # print(i+"\n")
  326. image_to_text.txt = image_to_text.txt + str(i) + "\n"
  327. # print(image_to_text.txt)
  328. def pdf_to_text():
  329. from pdfminer.high_level import extract_text
  330. pdf_to_text.txt = extract_text(found)
  331. # pdf_to_text.txt= text.replace('\n', ' ')
  332. extensionlist = ['JPEG', 'jpg', 'png', 'JPG', 'PNG', 'jpeg']
  333. if extension in extensionlist:
  334. print('image' + extension)
  335. image_to_text()
  336. x = image_to_text.txt
  337. else:
  338. print('pdf' + extension)
  339. pdf_to_text()
  340. x = pdf_to_text.txt
  341. verticaltext = x
  342. htext = x
  343. # print('------------------------------------------------')
  344. #print('############################################################# this is verticaltext #################################################################')
  345. # print(verticaltext)
  346. htext = htext.replace('\n', ' ')
  347. # print('############################################################# this is htext #############################################################')
  348. #print(htext)
  349. y = x.replace('\n', ',')
  350. y = y.replace(' ', ' ')
  351. # y = y.replace(".", " .")
  352. horizontaltext = y
  353. # print('------------------------------------------------')
  354. #print('############################################################# this is horizontaltext #############################################################')
  355. #print(horizontaltext)
  356. textfile = open("test123456.txt", "w")
  357. a = textfile.write(verticaltext)
  358. textfile.close()
  359. textfile = open("vtext.txt", "w")
  360. a = textfile.write(horizontaltext)
  361. textfile.close()
  362. with open('test123456.txt', 'r') as f:
  363. with open('test.txt', 'w') as w:
  364. for line in f:
  365. if line.strip().replace('|', ''):
  366. w.write(line)
  367. ###########################ADDRESS##################################
  368. addrespinlst = []
  369. def splitaddress():
  370. import re
  371. textaddress = htext.replace('\n', ' ')
  372. # print(textaddress)
  373. address1 = (textaddress.partition(",")[0])
  374. words = address1.split()
  375. address1 = words[-1]
  376. addre = (htext.partition(",")[2])
  377. a = addre.replace('\n', ' ').replace('\x0c', '')
  378. addre = (a.partition(",")[2])
  379. matches = re.findall(
  380. r'(.*?)-\d{3} \d{3}|(.*?)\b-\d{6}\b|(.*?)\b\d{6}\b|(.*?)\b\d{3} \d{3}\b|\b(.*?)-\d{2}\b|(.*?)\b\d{3} \d{3}\b',
  381. a)
  382. for match in matches:
  383. address2 = match
  384. address2 = str(address2)
  385. address2 = address2.replace("'", "").replace("(", "").replace(")", "").replace(', ,', '').replace(' ',
  386. '')
  387. matches = re.findall(r'-\d{6}|\b\d{6}\b|\b\d{3} \d{3}\b|\b-\d{2}\b|-\d{3} \d{3}|\b\d{3} \d{3}\b', a)
  388. for address3 in matches:
  389. pass
  390. try:
  391. Address = address1 + "," + address2 + "," + address3
  392. final.append('ADDRESS--' + Address)
  393. addrespinlst.append(Address)
  394. except NameError:
  395. print(
  396. '############################################################ Addressmodelworking #############################################################')
  397. # doc = nlp_model1(textaddress)
  398. # addlist = []
  399. # for ent in doc.ents:
  400. # name = (f'{ent.label_.upper():{10}}--{ent.text}')
  401. # addlist.append(name)
  402. # try:
  403. # Address = addlist[0]
  404. # final.append(Address)
  405. # addrespinlst.append(Address)
  406. # remove_list.append(
  407. # str(Address).replace("[", "").replace("]", "").replace("\\n", "").replace("'", "").replace(
  408. # "ADDRESS--",
  409. # ""))
  410. # except IndexError:
  411. # final.append("ADDRESS--")
  412. pass
  413. ################################################## website#######################################################
  414. # import re
  415. # url = []
  416. # matches = re.findall(r'www.*', verticaltext)
  417. # for match in matches:
  418. # if (match.count('.')) == 1:
  419. # a_string1 = match.replace("www", "www.")
  420. # final.append("Urls--" + a_string1)
  421. # url.append(a_string1)
  422. # else:
  423. # final.append("Urls--" + match)
  424. # if len(url)==0:
  425. # from urlextract import URLExtract
  426. # extractor = URLExtract()
  427. # urls = extractor.find_urls(verticaltext)
  428. # try:
  429. # urllist = urls[0]
  430. # final.append("Urls--"+urllist)
  431. # url.append(urllist)
  432. # except IndexError:
  433. # final.append("Urls--")
  434. # for match in matches:
  435. # if (match.count('.')) == 1:
  436. # a_string1 = match.replace("www", "www.")
  437. # final.append("Urls--" + a_string1)
  438. # url.append(a_string1)
  439. # else:
  440. # final.append("Urls--" + match)
  441. # url.append(match)
  442. # remove_list.append(match)
  443. # else:
  444. # final.append("Urls--" )
  445. ################################################## website#######################################################
  446. import re
  447. # final=[]
  448. url = []
  449. urlfinal = []
  450. matches = re.findall(r'www.*', verticaltext)
  451. for match in matches:
  452. if (match.count('.')) == 1:
  453. a_string1 = match.replace("www", "www.")
  454. # final.append("Urls--" + a_string1)
  455. url.append(a_string1)
  456. else:
  457. url.append(match)
  458. if len(url) == 0:
  459. from urlextract import URLExtract
  460. extractor = URLExtract()
  461. urls = extractor.find_urls(verticaltext)
  462. try:
  463. urllist = urls[0]
  464. url.append(urllist)
  465. url.append(urllist)
  466. except IndexError:
  467. pass
  468. for match in matches:
  469. if (match.count('.')) == 1:
  470. a_string1 = match.replace("www", "www.")
  471. url.append(a_string1)
  472. # url.append(a_string1)
  473. else:
  474. url.append(match)
  475. url.append(match)
  476. else:
  477. pass
  478. try:
  479. test_string = url[0]
  480. test_list = ['com', 'www', 'in', 'co', "WWW", "COM", "CO", "IN"]
  481. res = [ele for ele in test_list if (ele in test_string)]
  482. if len(res) == 0:
  483. print('no match')
  484. final.append('urls--')
  485. else:
  486. print('matched')
  487. final.append('urls--' + url[0])
  488. urlfinal.append(url[0])
  489. except IndexError:
  490. final.append('urls--')
  491. print(
  492. '############################################################# url #############################################################')
  493. print(url)
  494. #######organisation and contact################
  495. # def company_url():
  496. # # print('--url--')
  497. # # print(url)
  498. # try:
  499. # match = str(url[0]).lower()
  500. # match =match.replace('.com','').replace('www.','').replace('.in','').replace('.co','').upper()
  501. # final.append("OrganizationName--" + match)
  502. # # remove_list.append(match)
  503. # except IndexError:
  504. # org_name()
  505. # organisation()
  506. # final.append("OrganizationName--")
  507. # make example sentence
  508. # print(horizontaltext)
  509. sentence = Sentence(verticaltext)
  510. # predict NER tags
  511. tagger.predict(sentence)
  512. # print sentence
  513. ko = (sentence)
  514. ko1 = str(ko).split("→")
  515. import pandas as pd
  516. dfg = []
  517. try:
  518. s = ko1[1].replace("", "").replace("", "").replace("/", ":")
  519. except IndexError:
  520. os.remove(found)
  521. return 'Invalid image'
  522. dfg.append(s)
  523. df = pd.DataFrame(dfg)
  524. df = df[0]
  525. df.to_csv("df.csv", index=False)
  526. df1 = pd.read_csv("df.csv")
  527. ve = df1["0"].str.split(",")
  528. fgf = ve.to_list()
  529. dfgh = pd.DataFrame(fgf[0])
  530. maindf = dfgh[0] # .str.split(":")
  531. # maindf.to_csv("main.csv")
  532. main1 = maindf.to_list()
  533. main1
  534. # cv=pd.DataFrame(ve)
  535. # cv
  536. per = ["PER"]
  537. org = ["ORG"]
  538. loc = ["LOC"]
  539. organizations = [i for i in main1 for j in org if j in i]
  540. PErsons = [i for i in main1 for j in per if j in i]
  541. location = [i for i in main1 for j in loc if j in i]
  542. # ************************************* ORGANIZATION ********************************************************************
  543. try:
  544. if len(("OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace('[',
  545. '').replace(
  546. ']', '').replace(
  547. '.com', '') + " /" + organizations[1].replace(":ORG", "").replace('"', '').replace('.com', ''))) < 4:
  548. pass
  549. # company_url()
  550. else:
  551. match = str(urlfinal[0]).lower()
  552. match = match.replace('.com', '').replace('www.', '').replace('.in', '').replace('.co', '').replace(
  553. 'https',
  554. '').replace(
  555. 'http', '').replace(":", "").replace("/", "").upper()
  556. print(match)
  557. s1g = organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace(']', '').replace(
  558. '.com', '') + " /" + \
  559. organizations[1].replace(":ORG", "").replace('"', '').replace('.com', '')
  560. s1 = s1g.upper()
  561. s2 = match.upper()
  562. from difflib import SequenceMatcher
  563. print(s1)
  564. print(s2)
  565. print(SequenceMatcher(None, s1, s2).ratio())
  566. if SequenceMatcher(None, s1, s2).ratio() >= 0.10:
  567. # and SequenceMatcher(None, s1, s2).ratio()<0.50:
  568. final.append(
  569. "OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace('[',
  570. '').replace(
  571. '.com', '').replace(']', '') + " /" + organizations[1].replace(":ORG", "").replace('"',
  572. '').replace(
  573. '.com', '').replace(']', ''))
  574. else:
  575. final.append("OrganizationName--" + s2)
  576. except IndexError:
  577. try:
  578. if len(("OrganizationName--" + organizations[0].replace(":ORG", "").replace('[', '').replace(']',
  579. '').replace(
  580. '"',
  581. '').replace(
  582. '.com', ''))) < 4:
  583. pass
  584. # company_url()
  585. else:
  586. match = str(urlfinal[0]).lower()
  587. match = match.replace('.com', '').replace('www.', '').replace('.in', '').replace('.co', '').replace(
  588. 'https', '').replace('http', '').replace(":", "").replace("/", "").upper()
  589. s1g = organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace(']',
  590. '').replace(
  591. '.com', '')
  592. s1 = s1g.upper()
  593. s2 = match.upper()
  594. from difflib import SequenceMatcher
  595. print(s1)
  596. print(s2)
  597. print(SequenceMatcher(None, s1, s2).ratio())
  598. if SequenceMatcher(None, s1, s2).ratio() >= 0.10:
  599. # and SequenceMatcher(None, s1, s2).ratio()<0.50:
  600. final.append(
  601. "OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace('[',
  602. '').replace(
  603. ']', '').replace(
  604. '.com', '').replace(']', ''))
  605. else:
  606. final.append("OrganizationName--" + s2)
  607. except IndexError:
  608. org_name()
  609. organisation()
  610. # final.append("OrganizationName--")
  611. # ************************************* CONTACT PERSON *******************************************************************
  612. try:
  613. final.append(
  614. "CONTACTPERSONNAME--" + PErsons[0].replace(":PER", "").replace("[", "").replace('"', '').replace("]",
  615. "") +
  616. PErsons[
  617. 1].replace(":PER", "").replace('"', ''))
  618. except IndexError:
  619. try:
  620. final.append(
  621. "CONTACTPERSONNAME--" + PErsons[0].replace(":PER", "").replace("[", "").replace("]", "").replace(
  622. '"',
  623. ''))
  624. except IndexError:
  625. org_name()
  626. contactpersonname()
  627. # final.append("CONTACTPERSONNAME--")
  628. ###############address flair#####################
  629. try:
  630. print(
  631. '############################################################# address new code #############################################################')
  632. loactionlst = ['address', 'factory', 'd.no', 'h.no', 'h. no', 'plot', 'flat', 'plat']
  633. loclst = [i for i in loactionlst if i in htext.lower()]
  634. textaddress = htext
  635. textaddress = textaddress.replace("|", ",")
  636. textaddress = textaddress.lower()
  637. nlp = pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy="simple")
  638. grop = nlp(textaddress)
  639. citycountry = []
  640. print('########################### city or country name ###########################')
  641. d = grop[-1]
  642. if d['entity_group'] == "COUNTRY":
  643. print(d["word"])
  644. citycountry.append(d["word"])
  645. elif d['entity_group'] == "CITY":
  646. print(d["word"])
  647. citycountry.append(d["word"])
  648. try:
  649. address1 = loclst[0]
  650. except IndexError:
  651. address1 = (textaddress.partition(",")[0])
  652. words = address1.split()
  653. address1 = words[-1]
  654. star_location = address1.lower()
  655. end_location = citycountry[0].replace("#", "")
  656. start = star_location
  657. end = end_location
  658. s = textaddress.lower()
  659. middle_address = (s.split(start))[-1].split(end)[0]
  660. Address = start + middle_address + end
  661. Address = Address.replace('--', '').title()
  662. print(Address)
  663. if Address.count(',') < 2:
  664. splitaddress()
  665. else:
  666. final.append('ADDRESS--' + Address)
  667. # star_location = location[0].replace(":LOC", "").replace('"', '').replace('[', '')
  668. # end_location = location[-1].replace(":LOC", "").replace('"', '').replace(']', '')
  669. # d1 = star_location.split()
  670. # d2 = end_location.split()
  671. # d3 = d1[0]
  672. # d4 = d2[0]
  673. # start = d3
  674. # end = d4
  675. # s = horizontaltext
  676. # middle_address = ((s.split(start))[1].split(end)[0])
  677. # Address = d3 + middle_address + d4
  678. # final.append('ADDRESS--' + Address)
  679. # addrespinlst.append(Address)
  680. except IndexError:
  681. splitaddress()
  682. ########################################## Designation ###########################################
  683. import re
  684. new = []
  685. with open('test.txt', 'r') as f:
  686. flag = False
  687. for line in f:
  688. line1 = line
  689. line = line.upper()
  690. matches = re.findall(
  691. r'''\bAPPRENTICE\b|\bEXECUTIVE\b|\bPROPRIETOR\b|\bPARTNER\b|\bMD\b|\bANALYST\b|\bPRACTITIONER\b|\bCUSTOMER\b|\bCOO\b|\bCOACH\b|\bADMINISTRATIVE\b|\bADMINISTRATOR\b|\bAGENT\b|\bHEAD\b|\bCHIEF\b|\bDIRECTOR\b|\bVICE\b|\bPRESIDENT\b|\bMANAGER\b|\bCOORDINATOR\b|\bCOUNSELOR\b|\bSUPERVISOR\b|\bASSISTANT\b|\bSPECIALIST\b|\bARTIST\b|\bWORKER\b|\bCONSULTANT\b|\bREPRESENTATIVE\b|\bARCHITECT\b|\bSTAFF\b|\bMEMBER\b|\bDEVELOPER\b|\bENGINEER\b|\bEXAMINOR\b|\bDOCTOR\b|\bPROFESSOR\b|\bTEACHER\b|\bLEAD\b|\bOFFICER\b|\bCEO\b|\bC.E.O\b|\bJUNIOR\b|\bSENIOR\b|\bPROFESSOR\b|\bSALES\b''',
  692. line)
  693. for match in matches:
  694. line = line.replace('-', '')
  695. # print(line)
  696. o = "Designation--" + line
  697. new.append(o)
  698. remove_list.append(str(line1).replace('\n', ''))
  699. try:
  700. a = new[0].replace('\n', '')
  701. final.append(a)
  702. except IndexError:
  703. final.append("Designation--")
  704. ###################################################Phone number#################################################
  705. num = []
  706. import phonenumbers
  707. # print(verticaltext)
  708. numbers = phonenumbers.PhoneNumberMatcher(
  709. verticaltext.replace('+91', '').replace('(0)', '').replace('(', '').replace(')', ''), "IN")
  710. for number in numbers:
  711. number = str(number).split(")")
  712. num.append(number[1])
  713. # num.append(number[-1])
  714. if len(num) == 0:
  715. final.append("ContactNumber--")
  716. final.append("OrganizationNumber--")
  717. elif len(num) > 1:
  718. final.append("ContactNumber--" + num[0].replace(' ', ''))
  719. final.append("OrganizationNumber--" + num[-1].replace(' ', ''))
  720. elif len(num) == 1:
  721. try:
  722. final.append("ContactNumber--" + num[0].replace(' ', ''))
  723. final.append("OrganizationNumber--")
  724. except IndexError:
  725. final.append("ContactNumber--")
  726. final.append("OrganizationNumber--")
  727. print(
  728. '############################################################# num #############################################################')
  729. print(num)
  730. # try:
  731. # final.append("PhoneNumber--" + num[0].replace(' ', ''))
  732. # remove_list.append(num[0])
  733. # except IndexError:
  734. # pass
  735. # try:
  736. # final.append("PhoneNumber1--" + num[1].replace(' ', ''))
  737. # remove_list.append(num[1])
  738. # except IndexError:
  739. # pass
  740. # try:
  741. # final.append("PhoneNumber2--" + num[2].replace(' ', ''))
  742. # remove_list.append(num[2])
  743. # except IndexError:
  744. # pass
  745. ################################################### Email######################################################
  746. import re
  747. from email_scraper import scrape_emails
  748. s = list(scrape_emails(horizontaltext))
  749. email_id = s
  750. # email_id = []
  751. # matches = re.findall(r'[\w\.-]+@[\w\.-]+', verticaltext)
  752. # for match in matches:
  753. # email_id.append(match)
  754. # # final.append('Email--' + match)
  755. # email_ = str(email_id).replace("[", "").replace("]", "").replace("'", "")
  756. # # final.append(email_)
  757. # # final.append('Email--' + email_)
  758. # # remove_list.append(email_)
  759. if len(email_id) > 1:
  760. final.append(
  761. 'ContactEmail--' + str(email_id[0]).replace("[", "").replace("]", "").replace("\\n", "").replace("'",
  762. ""))
  763. final.append(
  764. 'OrganizationEmail--' + str(email_id[-1]).replace("[", "").replace("]", "").replace("\\n", "").replace(
  765. "'",
  766. ""))
  767. else:
  768. try:
  769. final.append(
  770. 'ContactEmail--' + str(email_id[0]).replace("[", "").replace("]", "").replace("\\n", "").replace(
  771. "'",
  772. ""))
  773. final.append('OrganizationEmail--')
  774. except IndexError:
  775. final.append('ContactEmail--')
  776. final.append('OrganizationEmail--')
  777. ###############PINCODE############
  778. pinlst = []
  779. print(addrespinlst)
  780. import pgeocode
  781. # try:
  782. # matche1 = re.findall(r'-\d{6}|\b\d{6}\b|\b\d{3} \d{3}\b|-\d{3} \d{3}|\b\d{3} \d{3}\b', addrespinlst[0])
  783. # for i in matche1:
  784. # address3 = i.replace(' ', '').replace('-', '')
  785. # pinlst.append(address3)
  786. # except IndexError:
  787. lst = []
  788. for i in num:
  789. i = i[1:]
  790. lst.append(i)
  791. infile = r"vtext.txt"
  792. outfile = r"cleaned_file.txt"
  793. import glob
  794. delete_list = lst
  795. # delete_list = ["firstname1 lastname1","firstname2 lastname2","firstnamen lastnamen",'Director - Sales & Business Development']
  796. fin = open(infile, "r+")
  797. fout = open(outfile, "w+")
  798. for line12 in fin:
  799. for word in delete_list:
  800. line12 = line12.replace(word, "")
  801. fout.write(line12)
  802. fin.close()
  803. # print(line)
  804. # print(addrespinlst)
  805. import pgeocode
  806. #print(line12)
  807. import re
  808. matche1 = re.findall(r'-\d{6}|\b\d{6}\b|\b\d{3} \d{3}\b|-\d{3} \d{3}|\b\d{3} \d{3}\b', line12)
  809. for i in matche1:
  810. address3 = i.replace(' ', '').replace('-', '')
  811. pinlst.append(address3)
  812. nomi = pgeocode.Nominatim('IN')
  813. try:
  814. a = nomi.query_postal_code(str(pinlst[-1]))
  815. # print(a)
  816. b = a.keys()
  817. c = b.values.tolist()
  818. d = a.tolist()
  819. postal_code = "PinCode1" + "--" + d[0]
  820. final.append(postal_code)
  821. country_code = c[1] + "--" + str(d[1])
  822. final.append(country_code)
  823. place_name = 'LandMark1' + "--" + str(d[2])
  824. final.append(place_name)
  825. state_name = c[3] + "--" + str(d[3])
  826. final.append(state_name)
  827. state_code = c[4] + "--" + str(d[4])
  828. final.append(state_code)
  829. county_name = 'CityName1' + "--" + str(d[5])
  830. final.append(county_name)
  831. except (IndexError, NameError):
  832. final.append("PinCode1--")
  833. final.append("country_code--")
  834. final.append("LandMark1--")
  835. final.append("state_name--")
  836. final.append("state_code--")
  837. final.append("CityName1--")
  838. ######################################################## json #####################################################################
  839. import pandas as pd
  840. df = pd.DataFrame(final)
  841. df1 = df[0].str.split('--', expand=True)
  842. # print(df1)
  843. df1.rename({df1.columns[-2]: 'Keys'}, axis=1, inplace=True)
  844. df1.rename({df1.columns[-1]: 'Values'}, axis=1, inplace=True)
  845. df1['Keys'] = df1['Keys'].str.strip()
  846. df1.to_csv('path123.csv', index=False)
  847. df2 = pd.read_csv('path123.csv')
  848. print(df2)
  849. df2 = df2.T
  850. df2.to_csv('path1.csv', index=False, header=False)
  851. df1 = pd.read_csv('path1.csv')
  852. df1.to_json('firstjson1.json', orient="index")
  853. import json
  854. with open('firstjson1.json', 'r') as json_file:
  855. json_load = json.load(json_file)
  856. # # url = "https://test.bizgaze.app:8443/apis/v4/bizgaze/integrations/businesscards/create"
  857. nothing = json.dumps(json_load).replace("]", "").replace("[", "").replace('{"0":', '').replace('}}', '}')
  858. # # print('--------------------------------------------------------------------------')
  859. # # print(nothing)
  860. empty = []
  861. import base64
  862. name = found
  863. image = open(name, 'rb')
  864. image_read = image.read()
  865. image_64_encode = base64.b64encode(image_read)
  866. NULL = 'null'
  867. empty.append("ByteData--" + (NULL).strip('""'))
  868. image_64_encode = image_64_encode.decode('utf-8')
  869. empty.append("FileData--" + str(image_64_encode))
  870. imagedata = name.split("/")
  871. imagename = str(imagedata[-1]).replace('"', '').replace("[", "").replace("]", "")
  872. imagename1 = str(imagename).split('.')
  873. imagename = str(imagename1[-2]).replace("[", "]")
  874. empty.append("FileName--" + imagename)
  875. empty.append("FilePath--"+ "")
  876. imageExtension = str(imagename1[-1]).replace("[", "]")
  877. empty.append("FileType--" + imageExtension)
  878. image.close()
  879. import pandas as pd
  880. df = pd.DataFrame(empty)
  881. df = df[0].str.split("--", expand=True)
  882. data1 = pd.DataFrame(df[0])
  883. data2 = pd.DataFrame(df[1])
  884. dt = data2.set_index(data1[0])
  885. dt4 = dt.T
  886. dictionary = dt4.to_dict(orient="index")
  887. list1 = []
  888. # list.append(a)
  889. list1.append(dictionary[1])
  890. # # final.append("image--"+str(dictionary[1]).replace("\'",'"'))
  891. print('--------------------')
  892. # print(namelist)
  893. import json
  894. # JSON data:
  895. x = nothing
  896. # python object to be appended
  897. y = {"image": dictionary[1]}
  898. # parsing JSON string:
  899. z = json.loads(x)
  900. # appending the data
  901. z.update(y)
  902. # the result is a JSON string:
  903. # print(json.dumps(z))
  904. zlist.append(z)
  905. #############################################creating csv#####################################
  906. #print(final)
  907. #print(imagelist)
  908. #final.append('image--' + str(imagelist))
  909. # import requests
  910. # import json
  911. # # url = "https://anwi.bizgaze.app/apis/v4/bizgaze/integrations/businesscards/create/list" #dev
  912. # url = "https://qa.bizgaze.com/apis/v4/bizgaze/integrations/businesscards/create/list" #testing
  913. # # url = "https://test.bizgaze.app/apis/v4/bizgaze/integrations/businesscards/create" # test
  914. # # url='http://localhost:3088/apis/v4/bizgaze/integrations/businesscards/create'
  915. # # url = 'https://c01.bizgaze.app/apis/v4/bizgaze/integrations/businesscards/create' # C01
  916. # payload1 = json.dumps(zlist)
  917. # # print('--------------------------------------------------------------------------')
  918. # #print(payload1)
  919. # headers = {
  920. # #'Authorization': 'stat 1a936137490040c997928f485e3cdd7a', #dev
  921. # # 'Authorization': 'stat 16516391d0074f4c8a15ea16fb49470b',#testing
  922. # # 'Authorization': 'stat 08e55fcfbaa940c8ab8145a074c444d1',
  923. # # 'Authorization': 'stat f7cdb402e01e44e5842878653946168f', # c01
  924. # #'Authorization': 'Stat c3e11b2fcbfe455b86a1fe6efde02a69',#demo
  925. # 'Authorization':'Stat e5bc6ad08f2c42feb5f98a2a521d00af',
  926. # 'Content-Type': 'application/json'
  927. # }
  928. # response = requests.request("POST", url, headers=headers, data=payload1)
  929. # # print("##############################################################")
  930. # print(payload1)
  931. # #print(zlist)
  932. # # import os
  933. # # if 'BusinessCards Created Successfully' in response.text:
  934. # # print('present')
  935. # # os.remove(found)
  936. # # else:
  937. # # print('not present')
  938. # df1.to_json('visitingcard.json')
  939. # data = df1.to_json('visiting.json', orient='records')
  940. # print(data)
  941. #return render_template('index.html')
  942. #return response.text
  943. #return z
  944. return zlist
  945. if __name__ == "__main__":
  946. app.run(host='0.0.0.0', port=1112)