No description
You cannot select more than 25 topics. Topics must start with a letter or number, can include hyphens ('-'), and must be at most 35 characters long.

Business_cards.py 43KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119
  1. from flask import Flask, render_template, request, redirect, Response, send_file
  2. import os
  3. # import openai
  4. import requests
  5. import pandas as pd
  6. import pgeocode
  7. from email_scraper import scrape_emails
  8. import phonenumbers
  9. from pdfminer.high_level import extract_text
  10. import pytesseract
  11. import time
  12. import multiprocessing
  13. from PIL import Image
  14. from functools import partial
  15. from urlextract import URLExtract
  16. import pytesseract as tess
  17. from PIL import Image
  18. import os
  19. import glob
  20. from pytesseract import *
  21. import shutil
  22. import cv2
  23. import matplotlib
  24. from werkzeug.utils import secure_filename
  25. import requests
  26. #import spacy
  27. import time
  28. import multiprocessing
  29. from PIL import Image
  30. from functools import partial
  31. import pandas as pd
  32. ################################################################
  33. Current_Working_Directory=os.getcwd()
  34. Current_Working_Directory=Current_Working_Directory.replace("\\","/")
  35. # nlp_model1 = spacy.load(Current_Working_Directory + "/Invoice_parser/p")
  36. ################################################################
  37. # import spacy
  38. # nlp_model1 = spacy.load('./ADD3001.2')
  39. from flair.data import Sentence
  40. from flair.models import SequenceTagger
  41. from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
  42. tokenizer = AutoTokenizer.from_pretrained("ml6team/bert-base-uncased-city-country-ner")
  43. model = AutoModelForTokenClassification.from_pretrained("ml6team/bert-base-uncased-city-country-ner")
  44. from paddleocr import PaddleOCR, draw_ocr
  45. ocr = PaddleOCR(use_angle_cls=True, lang='en', use_space_char=True, show_log=True)
  46. tagger = SequenceTagger.load("flair/ner-english-large")
  47. import datetime
  48. app = Flask(__name__)
  49. # app.config["IMAGE_UPLOADS"] = "C:/inetpub/wwwroot/FlaskApplication/Flask_Demo/upload/"
  50. @app.route('/', methods=['GET'])
  51. def card():
  52. return render_template('card.html')
  53. @app.route('/upload_BusinessCards', methods=["POST"])
  54. # @app.route('/multiplecards', methods=["POST"])
  55. def multiplecards():
  56. # print('################## multiple card detection #######################')
  57. # print(Dataset)
  58. from pathlib import Path
  59. Path("multicards").mkdir(exist_ok=True)
  60. datalist=[]
  61. zlist=[]
  62. Dataset = request.get_json()
  63. # print(data)
  64. #datalist.append(Dataset)
  65. data = {'visiting': Dataset}
  66. for i in data['visiting']:
  67. import time
  68. # time.sleep(1)
  69. a = i
  70. x = a['FileData']
  71. # print(x)
  72. y = a['FileName']
  73. z = a['FileType']
  74. # CreatedBy=a['CreatedBy']
  75. name = y + '.' + z
  76. # print(name)
  77. # print(y)
  78. # image = y.split("/")
  79. # filename=image[-1]
  80. # print(x)
  81. img_data = x.encode()
  82. import base64
  83. with open('./multicards/' + name, "wb") as fh:
  84. fh.write(base64.decodebytes(img_data))
  85. # print(i)
  86. # import os
  87. # import glob
  88. # for i in glob.glob('./multipleupload/*'):
  89. found = './multicards/' + name
  90. print(found)
  91. extension = found.split('.')[-1]
  92. # for root, dirs, fils in os.glob('./multipleupload'):
  93. # for name in files:
  94. # foundfile= os.path.join(root, name)
  95. # print(foundfile)
  96. import re
  97. import csv
  98. import glob
  99. import os
  100. # import pytesseract
  101. # import cv2
  102. import numpy as np
  103. import glob
  104. import os
  105. import cv2
  106. import requests
  107. final = []
  108. # final.append('assignto--'+CreatedBy)
  109. imagelist = []
  110. # print(found)
  111. remove_list = []
  112. import os
  113. import glob
  114. import pdfminer
  115. # import os
  116. # ts = 0
  117. # for file_name in glob.glob('./upload/*'):
  118. # fts = os.path.getmtime(file_name)
  119. # if fts > ts:
  120. # ts = fts
  121. # found = file_name
  122. # print(found)
  123. # print(extension)
# Nested helper of multiplecards().
# Purpose: OCR the current card file (`found` / `extension`, from the enclosing
# scope) into plain text — via Tesseract -> searchable PDF -> pdfminer for
# images, or pdfminer directly for PDFs — then run the flair NER tagger and
# collect tokens tagged ORG / PER / LOC.
# NOTE(review): the paste lost all indentation, so the block structure below
# (and where this def ends) is inferred from context — confirm against the
# real source file.
  124. def org_name():
  125. print('org_name is working')
  126. import pytesseract
  127. fname = found
# Non-PDF input: grayscale with OpenCV, re-save at high DPI, then OCR with
# Tesseract into a temporary searchable PDF ("demo.pdf").
  128. if extension != 'pdf':
  129. img = cv2.imread(fname)
  130. img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
  131. cv2.imwrite(str(found), img)
  132. from PIL import Image
  133. im = Image.open(found)
  134. im.save("images1.png", dpi=(1200, 1200))
  135. # import pytesseract
  136. fname = "images1.png"
  137. import pytesseract as tess
  138. from PIL import Image
# Hard-coded Windows Tesseract path — breaks on other hosts; should be config.
  139. tess.pytesseract.tesseract_cmd = r"C:\\Program Files (x86)\\Tesseract-OCR\\tesseract.exe"
  140. pdf = tess.image_to_pdf_or_hocr(fname, extension="pdf")
  141. with open("demo.pdf", "w+b", ) as f:
  142. f.write(pdf)
  143. from pdfminer.high_level import extract_text
  144. text = extract_text('demo.pdf')
  145. # doc = DocumentFile.from_images(found)
  146. # result = model(doc)
  147. # text = result.render()
  148. # from pdfminer.high_level import extract_text
  149. # txt = extract_text('demo.pdf')
  150. else:
  151. from pdfminer.high_level import extract_text
  152. text = extract_text(fname)
# Run the flair SequenceTagger (module-level `tagger`) over the extracted text.
  153. sentence = Sentence(text)
  154. # predict NER tags
  155. tagger.predict(sentence)
  156. # print sentence
  157. ko = (sentence)
# flair renders tagged output with a "→" separator; ko1[1] holds the tag list.
  158. ko1 = str(ko).split("→")
  159. import pandas as pd
  160. dfg = []
# IndexError below means tagging produced no "→"-separated tag section.
  161. try:
  162. s = ko1[1].replace("", "").replace("", "").replace("/", ":")
  163. # os.remove(found)
  164. # return 'Invalid image'
  165. dfg.append(s)
# Round-trips the tag string through a CSV file to split on commas — a plain
# str.split would do the same without disk I/O.
  166. df = pd.DataFrame(dfg)
  167. df = df[0]
  168. df.to_csv("df.csv", index=False)
  169. df1 = pd.read_csv("df.csv")
  170. ve = df1["0"].str.split(",")
  171. fgf = ve.to_list()
  172. dfgh = pd.DataFrame(fgf[0])
  173. maindf = dfgh[0] # .str.split(":")
  174. # maindf.to_csv("main.csv")
  175. main1 = maindf.to_list()
  176. main1
  177. # cv=pd.DataFrame(ve)
  178. # cv
  179. per = ["PER"]
  180. org = ["ORG"]
  181. loc = ["LOC"]
# NOTE(review): these look intended to feed sibling helpers (organisation(),
# contactpersonname()), but as written they are locals of org_name — verify
# the intended scoping against the original (indented) source.
  182. organizations = [i for i in main1 for j in org if j in i]
  183. PErsons = [i for i in main1 for j in per if j in i]
  184. location = [i for i in main1 for j in loc if j in i]
  185. except IndexError:
  186. pass
  187. # ************************************* ORGANIZATION ********************************************************************
# Nested helper of multiplecards().
# Purpose: append an "OrganizationName--..." entry to `final` (enclosing
# scope), built from the NER `organizations` list and cross-checked against
# the first detected URL (`urlfinal[0]`) with difflib.SequenceMatcher.
# Fallback chain (each level triggered by IndexError from a missing list
# element): two orgs -> one org -> URL only -> company() keyword scan.
# NOTE(review): indentation was lost in this paste; structure inferred.
  188. def organisation():
  189. print('organisation working ')
# Level 1: both organizations[0] and organizations[1] available.
  190. try:
  191. if len(("OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace('[',
  192. '').replace(
  193. '.com', '').replace('.in', '') + " /" + organizations[1].replace(":ORG", "").replace('"',
  194. '').replace(
  195. '.com', ''))) < 4:
  196. pass
  197. else:
  198. match = str(urlfinal[0]).lower()
  199. match = match.replace('.com', '').replace('www.', '').replace('.in', '').replace('.co', '').replace(
  200. 'https',
  201. '').replace(
  202. 'http', '').replace(":", "").replace("/", "").upper()
  203. print(match)
  204. s1g = organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace('.com',
  205. '') + " /" + \
  206. organizations[1].replace(":ORG", "").replace('"', '').replace('.com', '')
  207. s1 = s1g.upper()
  208. s2 = match.upper()
  209. from difflib import SequenceMatcher
  210. print(s1)
  211. print(s2)
  212. print(SequenceMatcher(None, s1, s2).ratio())
# A very low 0.10 similarity threshold: prefer the NER names when they bear
# any resemblance to the URL, else fall back to the URL-derived name.
  213. if SequenceMatcher(None, s1, s2).ratio() >= 0.10:
  214. # and SequenceMatcher(None, s1, s2).ratio()<0.50:
  215. final.append(
  216. "OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace('[',
  217. '').replace(
  218. '.com', '').replace(']', '') + " /" + organizations[1].replace(":ORG", "").replace('"',
  219. '').replace(
  220. '.com',
  221. '').replace(']', ''))
  222. else:
  223. final.append("OrganizationName--" + s2)
# Level 2: only organizations[0] available.
  224. except IndexError:
  225. try:
  226. if len(("OrganizationName--" + organizations[0].replace(":ORG", "").replace('[', '').replace(']',
  227. '').replace(
  228. '"',
  229. '').replace(
  230. '.com', '').replace('.in', ''))) < 4:
  231. pass
  232. else:
  233. match = str(urlfinal[0]).lower()
  234. match = match.replace('.com', '').replace('www.', '').replace('.in', '').replace('.co',
  235. '').replace(
  236. 'https', '').replace('http', '').replace(":", "").replace("/", "").upper()
  237. s1g = organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace('.com', '')
  238. s1 = s1g.upper()
  239. s2 = match.upper()
  240. from difflib import SequenceMatcher
  241. print(s1)
  242. print(s2)
  243. print(SequenceMatcher(None, s1, s2).ratio())
  244. if SequenceMatcher(None, s1, s2).ratio() >= 0.10:
  245. # and SequenceMatcher(None, s1, s2).ratio()<0.50:
  246. final.append(
  247. "OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace(
  248. '[',
  249. '').replace(
  250. ']', '').replace(
  251. '.com', ''))
  252. else:
  253. final.append("OrganizationName--" + s2)
# Level 3: no NER org at all — derive the name from the URL, else keyword scan.
  254. except IndexError:
  255. try:
  256. match = str(urlfinal[0]).lower()
  257. match = match.replace('.com', '').replace('www.', '').replace('.in', '').replace('.co',
  258. '').upper()
  259. final.append("OrganizationName--" + match)
  260. # remove_list.append(match)
  261. except IndexError:
  262. company()
  263. #################################################company Name########################################
# Nested helper of multiplecards() — last-resort organization detector.
# Purpose: scan the OCR dump "test.txt" line by line for company-style
# keywords (ENTERPRISE, PVT, LTD, ...) and append the first matching line to
# `final` as "OrganizationName--<line>", or an empty marker if none matched.
# NOTE(review): indentation was lost in this paste; structure inferred.
  264. def company():
  265. print('company list working')
  266. import re
  267. new = []
  268. with open('test.txt', 'r+') as f:
  269. flag = False
  270. for line in f:
  271. line = line.upper()
  272. matches = re.findall(
  273. r'''\bENTERPRISE\b|\bTRADE\b|\bEMPIRE\b|\bSTORES\b|\bMACHINERY\b|\bINDUSTRIES\b|\bTECHNOLOGY\b|\bCOMPANY\b|\bDESIGNERS\b|\bPOLYMER\b|\bBELT\b|\bAGRO\b|\bPLASTIC\b|\bGROUP\b|\bTOOLS\b|\bENGG.\b|\bSOLUTION\b|\bCONSTRUCTION\b|\bPACK\b|\bELECT\b|\bSTEEL\b|\bIRON\b|\bDIES\b|\bMOULD\b|\bCORPORATION\b|\bSEEDS\b|\bPOWER\b|\bCONSULTANT\b|\bMFG.\b|\bPRINT\b|\bFOOD\b|\bSOLAR\b|\bINDUSTRY\b|\bLIMITED\b|\bPRIVATE\b|\bPVT\b|\bLTD\b|\bOUTSOURCING\b|\bCNC\b|\bMACHINERIES\b|\bSOLUTIONS\b|\bENGINEERS\b|\bWORKS\b|\bPRODUCTS\b|\bENTERPRISES\b|\bCOMPANIES\b|\bPOLYMERS\b|\bTRADING\b''',
  274. line)
  275. for i in matches:
  276. if i in line:
  277. flag = True
# NOTE(review): `flag` is never reset (the reset below is commented out), so
# once any keyword line is seen, subsequent lines may also be collected; only
# new[0] is ultimately used, which limits the impact — confirm intent.
  278. if flag:
  279. o = "OrganizationName--" + line
  280. new.append(o)
  281. # if line.startswith('\n'):
  282. # flag = False
  283. try:
  284. a = new[0].replace('\n', '')
  285. final.append(a)
  286. except IndexError:
  287. final.append("OrganizationName--")
  288. # ************************************* CONTACT PERSON *******************************************************************
# Nested helper of multiplecards().
# Purpose: append a "CONTACTPERSONNAME--..." entry to `final` from the NER
# `PErsons` list: "first/second" when two names exist, just the first when
# only one exists, and an empty marker when the list is empty.
# NOTE(review): indentation was lost in this paste; structure inferred.
  289. def contactpersonname():
  290. print('contactpersonname working')
  291. try:
# Two persons detected: join them with '/'.
  292. final.append(
  293. "CONTACTPERSONNAME--" + PErsons[0].replace(":PER", "").replace("[", "").replace('"', '').replace(
  294. "]",
  295. "") + '/' +
  296. PErsons[
  297. 1].replace(":PER", "").replace('"', ''))
  298. except IndexError:
  299. try:
# Single person detected.
  300. final.append(
  301. "CONTACTPERSONNAME--" + PErsons[0].replace(":PER", "").replace("[", "").replace("]",
  302. "").replace(
  303. '"', ''))
  304. except IndexError:
  305. final.append("CONTACTPERSONNAME--")
# Nested helper of multiplecards().
# Purpose: OCR the image at `found` (enclosing scope) with the module-level
# PaddleOCR instance and store the concatenated recognized lines on the
# function attribute `image_to_text.txt` (poor-man's return value, read by
# the caller right after invocation).
# NOTE(review): indentation was lost in this paste; structure inferred.
  306. def image_to_text():
  307. # doc = DocumentFile.from_images(found)
  308. # result = model(doc)
  309. # image_to_text.txt = result.render()
  310. # tess.pytesseract.tesseract_cmd = r"C:\\Program Files (x86)\\Tesseract-OCR\\tesseract.exe"
  311. # img = Image.open(found)
  312. # text = tess.image_to_string(img)
  313. # image_to_text.txt = text
  314. # print(text)
  315. import cv2
  316. img_path = found
# Pre-process: grayscale the image in place (overwrites the uploaded file).
  317. img = cv2.imread(img_path)
  318. img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
  319. cv2.imwrite(str(found), img)
  320. result = ocr.ocr(img_path, cls=True)
  321. result = result[0]
# PaddleOCR result lines are [box, (text, confidence)]; keep only the text.
  322. txts = [line[1][0] for line in result]
  323. image_to_text.txt = ""
  324. for i in txts:
# Drop very short fragments (< 4 chars) as OCR noise.
  325. if len(i) < 4:
  326. continue
  327. # print(i+"\n")
  328. image_to_text.txt = image_to_text.txt + str(i) + "\n"
  329. # print(image_to_text.txt)
# Nested helper of multiplecards().
# Purpose: extract text from the PDF at `found` (enclosing scope) via
# pdfminer and expose it on the function attribute `pdf_to_text.txt`,
# mirroring the image_to_text() convention.
  330. def pdf_to_text():
  331. from pdfminer.high_level import extract_text
  332. pdf_to_text.txt = extract_text(found)
  333. # pdf_to_text.txt= text.replace('\n', ' ')
  334. extensionlist = ['JPEG', 'jpg', 'png', 'JPG', 'PNG', 'jpeg']
  335. if extension in extensionlist:
  336. print('image' + extension)
  337. image_to_text()
  338. x = image_to_text.txt
  339. else:
  340. print('pdf' + extension)
  341. pdf_to_text()
  342. x = pdf_to_text.txt
  343. verticaltext = x
  344. htext = x
  345. # print('------------------------------------------------')
  346. #print('############################################################# this is verticaltext #################################################################')
  347. # print(verticaltext)
  348. htext = htext.replace('\n', ' ')
  349. # print('############################################################# this is htext #############################################################')
  350. #print(htext)
  351. y = x.replace('\n', ',')
  352. y = y.replace(' ', ' ')
  353. # y = y.replace(".", " .")
  354. horizontaltext = y
  355. # print('------------------------------------------------')
  356. #print('############################################################# this is horizontaltext #############################################################')
  357. #print(horizontaltext)
  358. textfile = open("test123456.txt", "w")
  359. a = textfile.write(verticaltext)
  360. textfile.close()
  361. textfile = open("vtext.txt", "w")
  362. a = textfile.write(horizontaltext)
  363. textfile.close()
  364. with open('test123456.txt', 'r') as f:
  365. with open('test.txt', 'w') as w:
  366. for line in f:
  367. if line.strip().replace('|', ''):
  368. w.write(line)
  369. ###########################ADDRESS##################################
  370. addrespinlst = []
# Nested helper of multiplecards() — regex fallback for address extraction.
# Purpose: build an "ADDRESS--<part1>,<part2>,<pincode>" entry for `final`
# from the flattened OCR text `htext` (enclosing scope): part1 is the last
# word before the first comma, part2 comes from a PIN-code-anchored capture,
# and the pincode from a second findall. If no pincode was found, `address3`
# is never bound and the NameError path just logs and gives up.
# NOTE(review): indentation was lost in this paste; structure inferred.
  371. def splitaddress():
  372. import re
  373. textaddress = htext.replace('\n', ' ')
  374. # print(textaddress)
  375. address1 = (textaddress.partition(",")[0])
  376. words = address1.split()
  377. address1 = words[-1]
  378. addre = (htext.partition(",")[2])
  379. a = addre.replace('\n', ' ').replace('\x0c', '')
  380. addre = (a.partition(",")[2])
# Capture the text leading up to an Indian-style PIN code (6 digits, "ddd ddd",
# or hyphenated variants); the loop keeps only the LAST match.
  381. matches = re.findall(
  382. r'(.*?)-\d{3} \d{3}|(.*?)\b-\d{6}\b|(.*?)\b\d{6}\b|(.*?)\b\d{3} \d{3}\b|\b(.*?)-\d{2}\b|(.*?)\b\d{3} \d{3}\b',
  383. a)
  384. for match in matches:
  385. address2 = match
  386. address2 = str(address2)
  387. address2 = address2.replace("'", "").replace("(", "").replace(")", "").replace(', ,', '').replace(' ',
  388. '')
# Second pass: extract the PIN code itself; again only the last match sticks.
  389. matches = re.findall(r'-\d{6}|\b\d{6}\b|\b\d{3} \d{3}\b|\b-\d{2}\b|-\d{3} \d{3}|\b\d{3} \d{3}\b', a)
  390. for address3 in matches:
  391. pass
# NameError fires when no pincode match bound address2/address3 above.
  392. try:
  393. Address = address1 + "," + address2 + "," + address3
  394. final.append('ADDRESS--' + Address)
  395. addrespinlst.append(Address)
  396. except NameError:
  397. print(
  398. '############################################################ Addressmodelworking #############################################################')
  399. # doc = nlp_model1(textaddress)
  400. # addlist = []
  401. # for ent in doc.ents:
  402. # name = (f'{ent.label_.upper():{10}}--{ent.text}')
  403. # addlist.append(name)
  404. # try:
  405. # Address = addlist[0]
  406. # final.append(Address)
  407. # addrespinlst.append(Address)
  408. # remove_list.append(
  409. # str(Address).replace("[", "").replace("]", "").replace("\\n", "").replace("'", "").replace(
  410. # "ADDRESS--",
  411. # ""))
  412. # except IndexError:
  413. # final.append("ADDRESS--")
  414. pass
  415. ################################################## website#######################################################
  416. # import re
  417. # url = []
  418. # matches = re.findall(r'www.*', verticaltext)
  419. # for match in matches:
  420. # if (match.count('.')) == 1:
  421. # a_string1 = match.replace("www", "www.")
  422. # final.append("Urls--" + a_string1)
  423. # url.append(a_string1)
  424. # else:
  425. # final.append("Urls--" + match)
  426. # if len(url)==0:
  427. # from urlextract import URLExtract
  428. # extractor = URLExtract()
  429. # urls = extractor.find_urls(verticaltext)
  430. # try:
  431. # urllist = urls[0]
  432. # final.append("Urls--"+urllist)
  433. # url.append(urllist)
  434. # except IndexError:
  435. # final.append("Urls--")
  436. # for match in matches:
  437. # if (match.count('.')) == 1:
  438. # a_string1 = match.replace("www", "www.")
  439. # final.append("Urls--" + a_string1)
  440. # url.append(a_string1)
  441. # else:
  442. # final.append("Urls--" + match)
  443. # url.append(match)
  444. # remove_list.append(match)
  445. # else:
  446. # final.append("Urls--" )
  447. ################################################## website#######################################################
  448. import re
  449. # final=[]
  450. url = []
  451. urlfinal = []
  452. matches = re.findall(r'www.*', verticaltext)
  453. for match in matches:
  454. if (match.count('.')) == 1:
  455. a_string1 = match.replace("www", "www.")
  456. # final.append("Urls--" + a_string1)
  457. url.append(a_string1)
  458. else:
  459. url.append(match)
  460. if len(url) == 0:
  461. from urlextract import URLExtract
  462. extractor = URLExtract()
  463. urls = extractor.find_urls(verticaltext)
  464. try:
  465. urllist = urls[0]
  466. url.append(urllist)
  467. url.append(urllist)
  468. except IndexError:
  469. pass
  470. for match in matches:
  471. if (match.count('.')) == 1:
  472. a_string1 = match.replace("www", "www.")
  473. url.append(a_string1)
  474. # url.append(a_string1)
  475. else:
  476. url.append(match)
  477. url.append(match)
  478. else:
  479. pass
  480. try:
  481. test_string = url[0]
  482. test_list = ['com', 'www', 'in', 'co', "WWW", "COM", "CO", "IN"]
  483. res = [ele for ele in test_list if (ele in test_string)]
  484. if len(res) == 0:
  485. print('no match')
  486. final.append('urls--')
  487. else:
  488. print('matched')
  489. final.append('urls--' + url[0])
  490. urlfinal.append(url[0])
  491. except IndexError:
  492. final.append('urls--')
  493. print(
  494. '############################################################# url #############################################################')
  495. print(url)
  496. #######organisation and contact################
  497. # def company_url():
  498. # # print('--url--')
  499. # # print(url)
  500. # try:
  501. # match = str(url[0]).lower()
  502. # match =match.replace('.com','').replace('www.','').replace('.in','').replace('.co','').upper()
  503. # final.append("OrganizationName--" + match)
  504. # # remove_list.append(match)
  505. # except IndexError:
  506. # org_name()
  507. # organisation()
  508. # final.append("OrganizationName--")
  509. # make example sentence
  510. # print(horizontaltext)
  511. sentence = Sentence(verticaltext)
  512. # predict NER tags
  513. tagger.predict(sentence)
  514. # print sentence
  515. ko = (sentence)
  516. ko1 = str(ko).split("→")
  517. import pandas as pd
  518. dfg = []
  519. try:
  520. s = ko1[1].replace("", "").replace("", "").replace("/", ":")
  521. except IndexError:
  522. os.remove(found)
  523. return 'Invalid image'
  524. dfg.append(s)
  525. df = pd.DataFrame(dfg)
  526. df = df[0]
  527. df.to_csv("df.csv", index=False)
  528. df1 = pd.read_csv("df.csv")
  529. ve = df1["0"].str.split(",")
  530. fgf = ve.to_list()
  531. dfgh = pd.DataFrame(fgf[0])
  532. maindf = dfgh[0] # .str.split(":")
  533. # maindf.to_csv("main.csv")
  534. main1 = maindf.to_list()
  535. main1
  536. # cv=pd.DataFrame(ve)
  537. # cv
  538. per = ["PER"]
  539. org = ["ORG"]
  540. loc = ["LOC"]
  541. organizations = [i for i in main1 for j in org if j in i]
  542. PErsons = [i for i in main1 for j in per if j in i]
  543. location = [i for i in main1 for j in loc if j in i]
  544. # ************************************* ORGANIZATION ********************************************************************
  545. try:
  546. if len(("OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace('[',
  547. '').replace(
  548. ']', '').replace(
  549. '.com', '') + " /" + organizations[1].replace(":ORG", "").replace('"', '').replace('.com', ''))) < 4:
  550. pass
  551. # company_url()
  552. else:
  553. match = str(urlfinal[0]).lower()
  554. match = match.replace('.com', '').replace('www.', '').replace('.in', '').replace('.co', '').replace(
  555. 'https',
  556. '').replace(
  557. 'http', '').replace(":", "").replace("/", "").upper()
  558. print(match)
  559. s1g = organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace(']', '').replace(
  560. '.com', '') + " /" + \
  561. organizations[1].replace(":ORG", "").replace('"', '').replace('.com', '')
  562. s1 = s1g.upper()
  563. s2 = match.upper()
  564. from difflib import SequenceMatcher
  565. print(s1)
  566. print(s2)
  567. print(SequenceMatcher(None, s1, s2).ratio())
  568. if SequenceMatcher(None, s1, s2).ratio() >= 0.10:
  569. # and SequenceMatcher(None, s1, s2).ratio()<0.50:
  570. final.append(
  571. "OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace('[',
  572. '').replace(
  573. '.com', '').replace(']', '') + " /" + organizations[1].replace(":ORG", "").replace('"',
  574. '').replace(
  575. '.com', '').replace(']', ''))
  576. else:
  577. final.append("OrganizationName--" + s2)
  578. except IndexError:
  579. try:
  580. if len(("OrganizationName--" + organizations[0].replace(":ORG", "").replace('[', '').replace(']',
  581. '').replace(
  582. '"',
  583. '').replace(
  584. '.com', ''))) < 4:
  585. pass
  586. # company_url()
  587. else:
  588. match = str(urlfinal[0]).lower()
  589. match = match.replace('.com', '').replace('www.', '').replace('.in', '').replace('.co', '').replace(
  590. 'https', '').replace('http', '').replace(":", "").replace("/", "").upper()
  591. s1g = organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace(']',
  592. '').replace(
  593. '.com', '')
  594. s1 = s1g.upper()
  595. s2 = match.upper()
  596. from difflib import SequenceMatcher
  597. print(s1)
  598. print(s2)
  599. print(SequenceMatcher(None, s1, s2).ratio())
  600. if SequenceMatcher(None, s1, s2).ratio() >= 0.10:
  601. # and SequenceMatcher(None, s1, s2).ratio()<0.50:
  602. final.append(
  603. "OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace('[',
  604. '').replace(
  605. ']', '').replace(
  606. '.com', '').replace(']', ''))
  607. else:
  608. final.append("OrganizationName--" + s2)
  609. except IndexError:
  610. org_name()
  611. organisation()
  612. # final.append("OrganizationName--")
  613. # ************************************* CONTACT PERSON *******************************************************************
  614. try:
  615. final.append(
  616. "CONTACTPERSONNAME--" + PErsons[0].replace(":PER", "").replace("[", "").replace('"', '').replace("]",
  617. "") +
  618. PErsons[
  619. 1].replace(":PER", "").replace('"', ''))
  620. except IndexError:
  621. try:
  622. final.append(
  623. "CONTACTPERSONNAME--" + PErsons[0].replace(":PER", "").replace("[", "").replace("]", "").replace(
  624. '"',
  625. ''))
  626. except IndexError:
  627. org_name()
  628. contactpersonname()
  629. # final.append("CONTACTPERSONNAME--")
  630. ###############address flair#####################
  631. try:
  632. print(
  633. '############################################################# address new code #############################################################')
  634. loactionlst = ['address', 'factory', 'd.no', 'h.no', 'h. no', 'plot', 'flat', 'plat']
  635. loclst = [i for i in loactionlst if i in htext.lower()]
  636. textaddress = htext
  637. textaddress = textaddress.replace("|", ",")
  638. textaddress = textaddress.lower()
  639. nlp = pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy="simple")
  640. grop = nlp(textaddress)
  641. citycountry = []
  642. print('########################### city or country name ###########################')
  643. d = grop[-1]
  644. if d['entity_group'] == "COUNTRY":
  645. print(d["word"])
  646. citycountry.append(d["word"])
  647. elif d['entity_group'] == "CITY":
  648. print(d["word"])
  649. citycountry.append(d["word"])
  650. try:
  651. address1 = loclst[0]
  652. except IndexError:
  653. address1 = (textaddress.partition(",")[0])
  654. words = address1.split()
  655. address1 = words[-1]
  656. star_location = address1.lower()
  657. end_location = citycountry[0].replace("#", "")
  658. start = star_location
  659. end = end_location
  660. s = textaddress.lower()
  661. middle_address = (s.split(start))[-1].split(end)[0]
  662. Address = start + middle_address + end
  663. Address = Address.replace('--', '').title()
  664. print(Address)
  665. if Address.count(',') < 2:
  666. splitaddress()
  667. else:
  668. final.append('ADDRESS--' + Address)
  669. # star_location = location[0].replace(":LOC", "").replace('"', '').replace('[', '')
  670. # end_location = location[-1].replace(":LOC", "").replace('"', '').replace(']', '')
  671. # d1 = star_location.split()
  672. # d2 = end_location.split()
  673. # d3 = d1[0]
  674. # d4 = d2[0]
  675. # start = d3
  676. # end = d4
  677. # s = horizontaltext
  678. # middle_address = ((s.split(start))[1].split(end)[0])
  679. # Address = d3 + middle_address + d4
  680. # final.append('ADDRESS--' + Address)
  681. # addrespinlst.append(Address)
  682. except IndexError:
  683. splitaddress()
  684. ########################################## Designation ###########################################
  685. import re
  686. new = []
  687. with open('test.txt', 'r') as f:
  688. flag = False
  689. for line in f:
  690. line1 = line
  691. line = line.upper()
  692. matches = re.findall(
  693. r'''\bAPPRENTICE\b|\bEXECUTIVE\b|\bPROPRIETOR\b|\bPARTNER\b|\bMD\b|\bANALYST\b|\bPRACTITIONER\b|\bCUSTOMER\b|\bCOO\b|\bCOACH\b|\bADMINISTRATIVE\b|\bADMINISTRATOR\b|\bAGENT\b|\bHEAD\b|\bCHIEF\b|\bDIRECTOR\b|\bVICE\b|\bPRESIDENT\b|\bMANAGER\b|\bCOORDINATOR\b|\bCOUNSELOR\b|\bSUPERVISOR\b|\bASSISTANT\b|\bSPECIALIST\b|\bARTIST\b|\bWORKER\b|\bCONSULTANT\b|\bREPRESENTATIVE\b|\bARCHITECT\b|\bSTAFF\b|\bMEMBER\b|\bDEVELOPER\b|\bENGINEER\b|\bEXAMINOR\b|\bDOCTOR\b|\bPROFESSOR\b|\bTEACHER\b|\bLEAD\b|\bOFFICER\b|\bCEO\b|\bC.E.O\b|\bJUNIOR\b|\bSENIOR\b|\bPROFESSOR\b|\bSALES\b''',
  694. line)
  695. for match in matches:
  696. line = line.replace('-', '')
  697. # print(line)
  698. o = "Designation--" + line
  699. new.append(o)
  700. remove_list.append(str(line1).replace('\n', ''))
  701. try:
  702. a = new[0].replace('\n', '')
  703. final.append(a)
  704. except IndexError:
  705. final.append("Designation--")
  706. ###################################################Phone number#################################################
  707. num = []
  708. import phonenumbers
  709. # print(verticaltext)
  710. numbers = phonenumbers.PhoneNumberMatcher(
  711. verticaltext.replace('+91', '').replace('(0)', '').replace('(', '').replace(')', ''), "IN")
  712. for number in numbers:
  713. number = str(number).split(")")
  714. num.append(number[1])
  715. # num.append(number[-1])
  716. if len(num) == 0:
  717. final.append("ContactNumber--")
  718. final.append("OrganizationNumber--")
  719. elif len(num) > 1:
  720. final.append("ContactNumber--" + num[0].replace(' ', ''))
  721. final.append("OrganizationNumber--" + num[-1].replace(' ', ''))
  722. elif len(num) == 1:
  723. try:
  724. final.append("ContactNumber--" + num[0].replace(' ', ''))
  725. final.append("OrganizationNumber--")
  726. except IndexError:
  727. final.append("ContactNumber--")
  728. final.append("OrganizationNumber--")
  729. print(
  730. '############################################################# num #############################################################')
  731. print(num)
  732. # try:
  733. # final.append("PhoneNumber--" + num[0].replace(' ', ''))
  734. # remove_list.append(num[0])
  735. # except IndexError:
  736. # pass
  737. # try:
  738. # final.append("PhoneNumber1--" + num[1].replace(' ', ''))
  739. # remove_list.append(num[1])
  740. # except IndexError:
  741. # pass
  742. # try:
  743. # final.append("PhoneNumber2--" + num[2].replace(' ', ''))
  744. # remove_list.append(num[2])
  745. # except IndexError:
  746. # pass
  747. ################################################### Email######################################################
  748. import re
  749. from email_scraper import scrape_emails
  750. s = list(scrape_emails(horizontaltext))
  751. email_id = s
  752. # email_id = []
  753. # matches = re.findall(r'[\w\.-]+@[\w\.-]+', verticaltext)
  754. # for match in matches:
  755. # email_id.append(match)
  756. # # final.append('Email--' + match)
  757. # email_ = str(email_id).replace("[", "").replace("]", "").replace("'", "")
  758. # # final.append(email_)
  759. # # final.append('Email--' + email_)
  760. # # remove_list.append(email_)
  761. if len(email_id) > 1:
  762. final.append(
  763. 'ContactEmail--' + str(email_id[0]).replace("[", "").replace("]", "").replace("\\n", "").replace("'",
  764. ""))
  765. final.append(
  766. 'OrganizationEmail--' + str(email_id[-1]).replace("[", "").replace("]", "").replace("\\n", "").replace(
  767. "'",
  768. ""))
  769. else:
  770. try:
  771. final.append(
  772. 'ContactEmail--' + str(email_id[0]).replace("[", "").replace("]", "").replace("\\n", "").replace(
  773. "'",
  774. ""))
  775. final.append('OrganizationEmail--')
  776. except IndexError:
  777. final.append('ContactEmail--')
  778. final.append('OrganizationEmail--')
  779. ###############PINCODE############
  780. pinlst = []
  781. print(addrespinlst)
  782. import pgeocode
  783. # try:
  784. # matche1 = re.findall(r'-\d{6}|\b\d{6}\b|\b\d{3} \d{3}\b|-\d{3} \d{3}|\b\d{3} \d{3}\b', addrespinlst[0])
  785. # for i in matche1:
  786. # address3 = i.replace(' ', '').replace('-', '')
  787. # pinlst.append(address3)
  788. # except IndexError:
  789. lst = []
  790. for i in num:
  791. i = i[1:]
  792. lst.append(i)
  793. infile = r"vtext.txt"
  794. outfile = r"cleaned_file.txt"
  795. import glob
  796. delete_list = lst
  797. # delete_list = ["firstname1 lastname1","firstname2 lastname2","firstnamen lastnamen",'Director - Sales & Business Development']
  798. fin = open(infile, "r+")
  799. fout = open(outfile, "w+")
  800. for line12 in fin:
  801. for word in delete_list:
  802. line12 = line12.replace(word, "")
  803. fout.write(line12)
  804. fin.close()
  805. # print(line)
  806. # print(addrespinlst)
  807. import pgeocode
  808. #print(line12)
  809. import re
  810. matche1 = re.findall(r'-\d{6}|\b\d{6}\b|\b\d{3} \d{3}\b|-\d{3} \d{3}|\b\d{3} \d{3}\b', line12)
  811. for i in matche1:
  812. address3 = i.replace(' ', '').replace('-', '')
  813. pinlst.append(address3)
  814. nomi = pgeocode.Nominatim('IN')
  815. try:
  816. a = nomi.query_postal_code(str(pinlst[-1]))
  817. # print(a)
  818. b = a.keys()
  819. c = b.values.tolist()
  820. d = a.tolist()
  821. postal_code = "PinCode1" + "--" + d[0]
  822. final.append(postal_code)
  823. country_code = c[1] + "--" + str(d[1])
  824. final.append(country_code)
  825. place_name = 'LandMark1' + "--" + str(d[2])
  826. final.append(place_name)
  827. state_name = c[3] + "--" + str(d[3])
  828. final.append(state_name)
  829. state_code = c[4] + "--" + str(d[4])
  830. final.append(state_code)
  831. county_name = 'CityName1' + "--" + str(d[5])
  832. final.append(county_name)
  833. except (IndexError, NameError):
  834. final.append("PinCode1--")
  835. final.append("country_code--")
  836. final.append("LandMark1--")
  837. final.append("state_name--")
  838. final.append("state_code--")
  839. final.append("CityName1--")
  840. ######################################################## json #####################################################################
  841. import pandas as pd
  842. df = pd.DataFrame(final)
  843. df1 = df[0].str.split('--', expand=True)
  844. # print(df1)
  845. df1.rename({df1.columns[-2]: 'Keys'}, axis=1, inplace=True)
  846. df1.rename({df1.columns[-1]: 'Values'}, axis=1, inplace=True)
  847. df1['Keys'] = df1['Keys'].str.strip()
  848. df1.to_csv('path123.csv', index=False)
  849. df2 = pd.read_csv('path123.csv')
  850. print(df2)
  851. df2 = df2.T
  852. df2.to_csv('path1.csv', index=False, header=False)
  853. df1 = pd.read_csv('path1.csv')
  854. df1.to_json('firstjson1.json', orient="index")
  855. import json
  856. with open('firstjson1.json', 'r') as json_file:
  857. json_load = json.load(json_file)
  858. # # url = "https://test.bizgaze.app:8443/apis/v4/bizgaze/integrations/businesscards/create"
  859. nothing = json.dumps(json_load).replace("]", "").replace("[", "").replace('{"0":', '').replace('}}', '}')
  860. # # print('--------------------------------------------------------------------------')
  861. # # print(nothing)
  862. empty = []
  863. import base64
  864. name = found
  865. image = open(name, 'rb')
  866. image_read = image.read()
  867. image_64_encode = base64.b64encode(image_read)
  868. NULL = 'null'
  869. empty.append("ByteData--" + (NULL).strip('""'))
  870. image_64_encode = image_64_encode.decode('utf-8')
  871. empty.append("FileData--" + str(image_64_encode))
  872. imagedata = name.split("/")
  873. imagename = str(imagedata[-1]).replace('"', '').replace("[", "").replace("]", "")
  874. imagename1 = str(imagename).split('.')
  875. imagename = str(imagename1[-2]).replace("[", "]")
  876. empty.append("FileName--" + imagename)
  877. empty.append("FilePath--"+ "")
  878. imageExtension = str(imagename1[-1]).replace("[", "]")
  879. empty.append("FileType--" + imageExtension)
  880. image.close()
  881. import pandas as pd
  882. df = pd.DataFrame(empty)
  883. df = df[0].str.split("--", expand=True)
  884. data1 = pd.DataFrame(df[0])
  885. data2 = pd.DataFrame(df[1])
  886. dt = data2.set_index(data1[0])
  887. dt4 = dt.T
  888. dictionary = dt4.to_dict(orient="index")
  889. list1 = []
  890. # list.append(a)
  891. list1.append(dictionary[1])
  892. # # final.append("image--"+str(dictionary[1]).replace("\'",'"'))
  893. print('--------------------')
  894. # print(namelist)
  895. import json
  896. # JSON data:
  897. x = nothing
  898. # python object to be appended
  899. y = {"image": dictionary[1]}
  900. # parsing JSON string:
  901. z = json.loads(x)
  902. # appending the data
  903. z.update(y)
  904. # the result is a JSON string:
  905. # print(json.dumps(z))
  906. zlist.append(z)
  907. #############################################creating csv#####################################
  908. #print(final)
  909. #print(imagelist)
  910. #final.append('image--' + str(imagelist))
  911. # import requests
  912. # import json
  913. # # url = "https://anwi.bizgaze.app/apis/v4/bizgaze/integrations/businesscards/create/list" #dev
  914. # url = "https://qa.bizgaze.com/apis/v4/bizgaze/integrations/businesscards/create/list" #testing
  915. # # url = "https://test.bizgaze.app/apis/v4/bizgaze/integrations/businesscards/create" # test
  916. # # url='http://localhost:3088/apis/v4/bizgaze/integrations/businesscards/create'
  917. # # url = 'https://c01.bizgaze.app/apis/v4/bizgaze/integrations/businesscards/create' # C01
  918. # payload1 = json.dumps(zlist)
  919. # # print('--------------------------------------------------------------------------')
  920. # #print(payload1)
  921. # headers = {
  922. # #'Authorization': 'stat 1a936137490040c997928f485e3cdd7a', #dev
  923. # # 'Authorization': 'stat 16516391d0074f4c8a15ea16fb49470b',#testing
  924. # # 'Authorization': 'stat 08e55fcfbaa940c8ab8145a074c444d1',
  925. # # 'Authorization': 'stat f7cdb402e01e44e5842878653946168f', # c01
  926. # #'Authorization': 'Stat c3e11b2fcbfe455b86a1fe6efde02a69',#demo
  927. # 'Authorization':'Stat e5bc6ad08f2c42feb5f98a2a521d00af',
  928. # 'Content-Type': 'application/json'
  929. # }
  930. # response = requests.request("POST", url, headers=headers, data=payload1)
  931. # # print("##############################################################")
  932. # print(payload1)
  933. # #print(zlist)
  934. # # import os
  935. # # if 'BusinessCards Created Successfully' in response.text:
  936. # # print('present')
  937. # # os.remove(found)
  938. # # else:
  939. # # print('not present')
  940. # df1.to_json('visitingcard.json')
  941. # data = df1.to_json('visiting.json', orient='records')
  942. # print(data)
  943. #return render_template('index.html')
  944. #return response.text
  945. #return z
  946. return zlist
  947. if __name__ == "__main__":
  948. app.run(host='0.0.0.0', port=1112)