Aucune description
Vous ne pouvez pas sélectionner plus de 25 sujets. Les noms de sujets doivent commencer par une lettre ou un nombre, peuvent contenir des tirets ('-') et peuvent comporter jusqu'à 35 caractères.

Business_cards.py 43KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113
  1. from flask import Flask, render_template, request, redirect, Response, send_file
  2. import os
  3. import openai
  4. import requests
  5. import pandas as pd
  6. import pgeocode
  7. from email_scraper import scrape_emails
  8. import phonenumbers
  9. from pdfminer.high_level import extract_text
  10. import pytesseract
  11. import time
  12. import multiprocessing
  13. from PIL import Image
  14. from functools import partial
  15. from urlextract import URLExtract
  16. import pytesseract as tess
  17. from PIL import Image
  18. import os
  19. import glob
  20. from pytesseract import *
  21. import shutil
  22. import cv2
  23. import matplotlib
  24. from werkzeug.utils import secure_filename
  25. import requests
  26. import spacy
  27. import time
  28. import multiprocessing
  29. from PIL import Image
  30. from functools import partial
  31. import pandas as pd
  32. ################################################################
  33. Current_Working_Directory=os.getcwd()
  34. Current_Working_Directory=Current_Working_Directory.replace("\\","/")
  35. nlp_model1 = spacy.load(Current_Working_Directory + "/Invoice_parser/p")
  36. ################################################################
  37. # import spacy
  38. # nlp_model1 = spacy.load('./ADD3001.2')
  39. from flair.data import Sentence
  40. from flair.models import SequenceTagger
  41. from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
  42. tokenizer = AutoTokenizer.from_pretrained("ml6team/bert-base-uncased-city-country-ner")
  43. model = AutoModelForTokenClassification.from_pretrained("ml6team/bert-base-uncased-city-country-ner")
  44. from paddleocr import PaddleOCR, draw_ocr
  45. ocr = PaddleOCR(use_angle_cls=True, lang='en', use_space_char=True, show_log=True)
  46. tagger = SequenceTagger.load("flair/ner-english-large")
  47. import datetime
  48. app = Flask(__name__)
  49. # app.config["IMAGE_UPLOADS"] = "C:/inetpub/wwwroot/FlaskApplication/Flask_Demo/upload/"
  50. @app.route('/', methods=['GET'])
  51. def card():
  52. return render_template('card.html')
  53. @app.route('/upload_BusinessCards', methods=["POST"])
  54. # @app.route('/multiplecards', methods=["POST"])
  55. def multiplecards():
  56. # print('################## multiple card detection #######################')
  57. # print(Dataset)
  58. datalist=[]
  59. Dataset = request.get_json()
  60. # print(data)
  61. #datalist.append(Dataset)
  62. data = {'visiting': Dataset}
  63. for i in data['visiting']:
  64. import time
  65. # time.sleep(1)
  66. a = i
  67. x = a['FileData']
  68. # print(x)
  69. y = a['FileName']
  70. z = a['FileType']
  71. # CreatedBy=a['CreatedBy']
  72. name = y + '.' + z
  73. # print(name)
  74. # print(y)
  75. # image = y.split("/")
  76. # filename=image[-1]
  77. # print(x)
  78. img_data = x.encode()
  79. import base64
  80. with open('./multicards/' + name, "wb") as fh:
  81. fh.write(base64.decodebytes(img_data))
  82. # print(i)
  83. # import os
  84. # import glob
  85. # for i in glob.glob('./multipleupload/*'):
  86. found = './multicards/' + name
  87. print(found)
  88. extension = found.split('.')[-1]
  89. # for root, dirs, fils in os.glob('./multipleupload'):
  90. # for name in files:
  91. # foundfile= os.path.join(root, name)
  92. # print(foundfile)
  93. import re
  94. import csv
  95. import glob
  96. import os
  97. # import pytesseract
  98. # import cv2
  99. import numpy as np
  100. import glob
  101. import os
  102. import cv2
  103. import requests
  104. final = []
  105. # final.append('assignto--'+CreatedBy)
  106. imagelist = []
  107. # print(found)
  108. remove_list = []
  109. import os
  110. import glob
  111. import pdfminer
  112. # import os
  113. # ts = 0
  114. # for file_name in glob.glob('./upload/*'):
  115. # fts = os.path.getmtime(file_name)
  116. # if fts > ts:
  117. # ts = fts
  118. # found = file_name
  119. # print(found)
  120. # print(extension)
  121. def org_name():
  122. print('org_name is working')
  123. import pytesseract
  124. fname = found
  125. if extension != 'pdf':
  126. img = cv2.imread(fname)
  127. img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
  128. cv2.imwrite(str(found), img)
  129. from PIL import Image
  130. im = Image.open(found)
  131. im.save("images1.png", dpi=(1200, 1200))
  132. # import pytesseract
  133. fname = "images1.png"
  134. import pytesseract as tess
  135. from PIL import Image
  136. tess.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
  137. pdf = tess.image_to_pdf_or_hocr(fname, extension="pdf")
  138. with open("demo.pdf", "w+b", ) as f:
  139. f.write(pdf)
  140. from pdfminer.high_level import extract_text
  141. text = extract_text('demo.pdf')
  142. # doc = DocumentFile.from_images(found)
  143. # result = model(doc)
  144. # text = result.render()
  145. # from pdfminer.high_level import extract_text
  146. # txt = extract_text('demo.pdf')
  147. else:
  148. from pdfminer.high_level import extract_text
  149. text = extract_text(fname)
  150. sentence = Sentence(text)
  151. # predict NER tags
  152. tagger.predict(sentence)
  153. # print sentence
  154. ko = (sentence)
  155. ko1 = str(ko).split("→")
  156. import pandas as pd
  157. dfg = []
  158. try:
  159. s = ko1[1].replace("", "").replace("", "").replace("/", ":")
  160. # os.remove(found)
  161. # return 'Invalid image'
  162. dfg.append(s)
  163. df = pd.DataFrame(dfg)
  164. df = df[0]
  165. df.to_csv("df.csv", index=False)
  166. df1 = pd.read_csv("df.csv")
  167. ve = df1["0"].str.split(",")
  168. fgf = ve.to_list()
  169. dfgh = pd.DataFrame(fgf[0])
  170. maindf = dfgh[0] # .str.split(":")
  171. # maindf.to_csv("main.csv")
  172. main1 = maindf.to_list()
  173. main1
  174. # cv=pd.DataFrame(ve)
  175. # cv
  176. per = ["PER"]
  177. org = ["ORG"]
  178. loc = ["LOC"]
  179. organizations = [i for i in main1 for j in org if j in i]
  180. PErsons = [i for i in main1 for j in per if j in i]
  181. location = [i for i in main1 for j in loc if j in i]
  182. except IndexError:
  183. pass
  184. # ************************************* ORGANIZATION ********************************************************************
  185. def organisation():
  186. print('organisation working ')
  187. try:
  188. if len(("OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace('[',
  189. '').replace(
  190. '.com', '').replace('.in', '') + " /" + organizations[1].replace(":ORG", "").replace('"',
  191. '').replace(
  192. '.com', ''))) < 4:
  193. pass
  194. else:
  195. match = str(urlfinal[0]).lower()
  196. match = match.replace('.com', '').replace('www.', '').replace('.in', '').replace('.co', '').replace(
  197. 'https',
  198. '').replace(
  199. 'http', '').replace(":", "").replace("/", "").upper()
  200. print(match)
  201. s1g = organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace('.com',
  202. '') + " /" + \
  203. organizations[1].replace(":ORG", "").replace('"', '').replace('.com', '')
  204. s1 = s1g.upper()
  205. s2 = match.upper()
  206. from difflib import SequenceMatcher
  207. print(s1)
  208. print(s2)
  209. print(SequenceMatcher(None, s1, s2).ratio())
  210. if SequenceMatcher(None, s1, s2).ratio() >= 0.10:
  211. # and SequenceMatcher(None, s1, s2).ratio()<0.50:
  212. final.append(
  213. "OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace('[',
  214. '').replace(
  215. '.com', '').replace(']', '') + " /" + organizations[1].replace(":ORG", "").replace('"',
  216. '').replace(
  217. '.com',
  218. '').replace(']', ''))
  219. else:
  220. final.append("OrganizationName--" + s2)
  221. except IndexError:
  222. try:
  223. if len(("OrganizationName--" + organizations[0].replace(":ORG", "").replace('[', '').replace(']',
  224. '').replace(
  225. '"',
  226. '').replace(
  227. '.com', '').replace('.in', ''))) < 4:
  228. pass
  229. else:
  230. match = str(urlfinal[0]).lower()
  231. match = match.replace('.com', '').replace('www.', '').replace('.in', '').replace('.co',
  232. '').replace(
  233. 'https', '').replace('http', '').replace(":", "").replace("/", "").upper()
  234. s1g = organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace('.com', '')
  235. s1 = s1g.upper()
  236. s2 = match.upper()
  237. from difflib import SequenceMatcher
  238. print(s1)
  239. print(s2)
  240. print(SequenceMatcher(None, s1, s2).ratio())
  241. if SequenceMatcher(None, s1, s2).ratio() >= 0.10:
  242. # and SequenceMatcher(None, s1, s2).ratio()<0.50:
  243. final.append(
  244. "OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace(
  245. '[',
  246. '').replace(
  247. ']', '').replace(
  248. '.com', ''))
  249. else:
  250. final.append("OrganizationName--" + s2)
  251. except IndexError:
  252. try:
  253. match = str(urlfinal[0]).lower()
  254. match = match.replace('.com', '').replace('www.', '').replace('.in', '').replace('.co',
  255. '').upper()
  256. final.append("OrganizationName--" + match)
  257. # remove_list.append(match)
  258. except IndexError:
  259. company()
  260. #################################################company Name########################################
  261. def company():
  262. print('company list working')
  263. import re
  264. new = []
  265. with open('test.txt', 'r+') as f:
  266. flag = False
  267. for line in f:
  268. line = line.upper()
  269. matches = re.findall(
  270. r'''\bENTERPRISE\b|\bTRADE\b|\bEMPIRE\b|\bSTORES\b|\bMACHINERY\b|\bINDUSTRIES\b|\bTECHNOLOGY\b|\bCOMPANY\b|\bDESIGNERS\b|\bPOLYMER\b|\bBELT\b|\bAGRO\b|\bPLASTIC\b|\bGROUP\b|\bTOOLS\b|\bENGG.\b|\bSOLUTION\b|\bCONSTRUCTION\b|\bPACK\b|\bELECT\b|\bSTEEL\b|\bIRON\b|\bDIES\b|\bMOULD\b|\bCORPORATION\b|\bSEEDS\b|\bPOWER\b|\bCONSULTANT\b|\bMFG.\b|\bPRINT\b|\bFOOD\b|\bSOLAR\b|\bINDUSTRY\b|\bLIMITED\b|\bPRIVATE\b|\bPVT\b|\bLTD\b|\bOUTSOURCING\b|\bCNC\b|\bMACHINERIES\b|\bSOLUTIONS\b|\bENGINEERS\b|\bWORKS\b|\bPRODUCTS\b|\bENTERPRISES\b|\bCOMPANIES\b|\bPOLYMERS\b|\bTRADING\b''',
  271. line)
  272. for i in matches:
  273. if i in line:
  274. flag = True
  275. if flag:
  276. o = "OrganizationName--" + line
  277. new.append(o)
  278. # if line.startswith('\n'):
  279. # flag = False
  280. try:
  281. a = new[0].replace('\n', '')
  282. final.append(a)
  283. except IndexError:
  284. final.append("OrganizationName--")
  285. # ************************************* CONTACT PERSON *******************************************************************
  286. def contactpersonname():
  287. print('contactpersonname working')
  288. try:
  289. final.append(
  290. "CONTACTPERSONNAME--" + PErsons[0].replace(":PER", "").replace("[", "").replace('"', '').replace(
  291. "]",
  292. "") + '/' +
  293. PErsons[
  294. 1].replace(":PER", "").replace('"', ''))
  295. except IndexError:
  296. try:
  297. final.append(
  298. "CONTACTPERSONNAME--" + PErsons[0].replace(":PER", "").replace("[", "").replace("]",
  299. "").replace(
  300. '"', ''))
  301. except IndexError:
  302. final.append("CONTACTPERSONNAME--")
  303. def image_to_text():
  304. # doc = DocumentFile.from_images(found)
  305. # result = model(doc)
  306. # image_to_text.txt = result.render()
  307. # tess.pytesseract.tesseract_cmd = r"C:\\Program Files (x86)\\Tesseract-OCR\\tesseract.exe"
  308. # img = Image.open(found)
  309. # text = tess.image_to_string(img)
  310. # image_to_text.txt = text
  311. # print(text)
  312. import cv2
  313. img_path = found
  314. img = cv2.imread(img_path)
  315. img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
  316. cv2.imwrite(str(found), img)
  317. result = ocr.ocr(img_path, cls=True)
  318. result = result[0]
  319. txts = [line[1][0] for line in result]
  320. image_to_text.txt = ""
  321. for i in txts:
  322. if len(i) < 4:
  323. continue
  324. # print(i+"\n")
  325. image_to_text.txt = image_to_text.txt + str(i) + "\n"
  326. # print(image_to_text.txt)
  327. def pdf_to_text():
  328. from pdfminer.high_level import extract_text
  329. pdf_to_text.txt = extract_text(found)
  330. # pdf_to_text.txt= text.replace('\n', ' ')
  331. extensionlist = ['JPEG', 'jpg', 'png', 'JPG', 'PNG', 'jpeg']
  332. if extension in extensionlist:
  333. print('image' + extension)
  334. image_to_text()
  335. x = image_to_text.txt
  336. else:
  337. print('pdf' + extension)
  338. pdf_to_text()
  339. x = pdf_to_text.txt
  340. verticaltext = x
  341. htext = x
  342. # print('------------------------------------------------')
  343. print(
  344. '############################################################# this is verticaltext #################################################################')
  345. print(verticaltext)
  346. htext = htext.replace('\n', ' ')
  347. print(
  348. '############################################################# this is htext #############################################################')
  349. print(htext)
  350. y = x.replace('\n', ',')
  351. y = y.replace(' ', ' ')
  352. # y = y.replace(".", " .")
  353. horizontaltext = y
  354. # print('------------------------------------------------')
  355. print(
  356. '############################################################# this is horizontaltext #############################################################')
  357. print(horizontaltext)
  358. textfile = open("test123456.txt", "w")
  359. a = textfile.write(verticaltext)
  360. textfile.close()
  361. textfile = open("vtext.txt", "w")
  362. a = textfile.write(horizontaltext)
  363. textfile.close()
  364. with open('test123456.txt', 'r') as f:
  365. with open('test.txt', 'w') as w:
  366. for line in f:
  367. if line.strip().replace('|', ''):
  368. w.write(line)
  369. ###########################ADDRESS##################################
  370. addrespinlst = []
  371. def splitaddress():
  372. import re
  373. textaddress = htext.replace('\n', ' ')
  374. # print(textaddress)
  375. address1 = (textaddress.partition(",")[0])
  376. words = address1.split()
  377. address1 = words[-1]
  378. addre = (htext.partition(",")[2])
  379. a = addre.replace('\n', ' ').replace('\x0c', '')
  380. addre = (a.partition(",")[2])
  381. matches = re.findall(
  382. r'(.*?)-\d{3} \d{3}|(.*?)\b-\d{6}\b|(.*?)\b\d{6}\b|(.*?)\b\d{3} \d{3}\b|\b(.*?)-\d{2}\b|(.*?)\b\d{3} \d{3}\b',
  383. a)
  384. for match in matches:
  385. address2 = match
  386. address2 = str(address2)
  387. address2 = address2.replace("'", "").replace("(", "").replace(")", "").replace(', ,', '').replace(' ',
  388. '')
  389. matches = re.findall(r'-\d{6}|\b\d{6}\b|\b\d{3} \d{3}\b|\b-\d{2}\b|-\d{3} \d{3}|\b\d{3} \d{3}\b', a)
  390. for address3 in matches:
  391. pass
  392. try:
  393. Address = address1 + "," + address2 + "," + address3
  394. final.append('ADDRESS--' + Address)
  395. addrespinlst.append(Address)
  396. except NameError:
  397. print(
  398. '############################################################ Addressmodelworking #############################################################')
  399. # doc = nlp_model1(textaddress)
  400. # addlist = []
  401. # for ent in doc.ents:
  402. # name = (f'{ent.label_.upper():{10}}--{ent.text}')
  403. # addlist.append(name)
  404. # try:
  405. # Address = addlist[0]
  406. # final.append(Address)
  407. # addrespinlst.append(Address)
  408. # remove_list.append(
  409. # str(Address).replace("[", "").replace("]", "").replace("\\n", "").replace("'", "").replace(
  410. # "ADDRESS--",
  411. # ""))
  412. # except IndexError:
  413. # final.append("ADDRESS--")
  414. pass
  415. ################################################## website#######################################################
  416. # import re
  417. # url = []
  418. # matches = re.findall(r'www.*', verticaltext)
  419. # for match in matches:
  420. # if (match.count('.')) == 1:
  421. # a_string1 = match.replace("www", "www.")
  422. # final.append("Urls--" + a_string1)
  423. # url.append(a_string1)
  424. # else:
  425. # final.append("Urls--" + match)
  426. # if len(url)==0:
  427. # from urlextract import URLExtract
  428. # extractor = URLExtract()
  429. # urls = extractor.find_urls(verticaltext)
  430. # try:
  431. # urllist = urls[0]
  432. # final.append("Urls--"+urllist)
  433. # url.append(urllist)
  434. # except IndexError:
  435. # final.append("Urls--")
  436. # for match in matches:
  437. # if (match.count('.')) == 1:
  438. # a_string1 = match.replace("www", "www.")
  439. # final.append("Urls--" + a_string1)
  440. # url.append(a_string1)
  441. # else:
  442. # final.append("Urls--" + match)
  443. # url.append(match)
  444. # remove_list.append(match)
  445. # else:
  446. # final.append("Urls--" )
  447. ################################################## website#######################################################
  448. import re
  449. # final=[]
  450. url = []
  451. urlfinal = []
  452. matches = re.findall(r'www.*', verticaltext)
  453. for match in matches:
  454. if (match.count('.')) == 1:
  455. a_string1 = match.replace("www", "www.")
  456. # final.append("Urls--" + a_string1)
  457. url.append(a_string1)
  458. else:
  459. url.append(match)
  460. if len(url) == 0:
  461. from urlextract import URLExtract
  462. extractor = URLExtract()
  463. urls = extractor.find_urls(verticaltext)
  464. try:
  465. urllist = urls[0]
  466. url.append(urllist)
  467. url.append(urllist)
  468. except IndexError:
  469. pass
  470. for match in matches:
  471. if (match.count('.')) == 1:
  472. a_string1 = match.replace("www", "www.")
  473. url.append(a_string1)
  474. # url.append(a_string1)
  475. else:
  476. url.append(match)
  477. url.append(match)
  478. else:
  479. pass
  480. try:
  481. test_string = url[0]
  482. test_list = ['com', 'www', 'in', 'co', "WWW", "COM", "CO", "IN"]
  483. res = [ele for ele in test_list if (ele in test_string)]
  484. if len(res) == 0:
  485. print('no match')
  486. final.append('urls--')
  487. else:
  488. print('matched')
  489. final.append('urls--' + url[0])
  490. urlfinal.append(url[0])
  491. except IndexError:
  492. final.append('urls--')
  493. print(
  494. '############################################################# url #############################################################')
  495. print(url)
  496. #######organisation and contact################
  497. # def company_url():
  498. # # print('--url--')
  499. # # print(url)
  500. # try:
  501. # match = str(url[0]).lower()
  502. # match =match.replace('.com','').replace('www.','').replace('.in','').replace('.co','').upper()
  503. # final.append("OrganizationName--" + match)
  504. # # remove_list.append(match)
  505. # except IndexError:
  506. # org_name()
  507. # organisation()
  508. # final.append("OrganizationName--")
  509. # make example sentence
  510. # print(horizontaltext)
  511. sentence = Sentence(verticaltext)
  512. # predict NER tags
  513. tagger.predict(sentence)
  514. # print sentence
  515. ko = (sentence)
  516. ko1 = str(ko).split("→")
  517. import pandas as pd
  518. dfg = []
  519. try:
  520. s = ko1[1].replace("", "").replace("", "").replace("/", ":")
  521. except IndexError:
  522. os.remove(found)
  523. return 'Invalid image'
  524. dfg.append(s)
  525. df = pd.DataFrame(dfg)
  526. df = df[0]
  527. df.to_csv("df.csv", index=False)
  528. df1 = pd.read_csv("df.csv")
  529. ve = df1["0"].str.split(",")
  530. fgf = ve.to_list()
  531. dfgh = pd.DataFrame(fgf[0])
  532. maindf = dfgh[0] # .str.split(":")
  533. # maindf.to_csv("main.csv")
  534. main1 = maindf.to_list()
  535. main1
  536. # cv=pd.DataFrame(ve)
  537. # cv
  538. per = ["PER"]
  539. org = ["ORG"]
  540. loc = ["LOC"]
  541. organizations = [i for i in main1 for j in org if j in i]
  542. PErsons = [i for i in main1 for j in per if j in i]
  543. location = [i for i in main1 for j in loc if j in i]
  544. # ************************************* ORGANIZATION ********************************************************************
  545. try:
  546. if len(("OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace('[',
  547. '').replace(
  548. ']', '').replace(
  549. '.com', '') + " /" + organizations[1].replace(":ORG", "").replace('"', '').replace('.com', ''))) < 4:
  550. pass
  551. # company_url()
  552. else:
  553. match = str(urlfinal[0]).lower()
  554. match = match.replace('.com', '').replace('www.', '').replace('.in', '').replace('.co', '').replace(
  555. 'https',
  556. '').replace(
  557. 'http', '').replace(":", "").replace("/", "").upper()
  558. print(match)
  559. s1g = organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace(']', '').replace(
  560. '.com', '') + " /" + \
  561. organizations[1].replace(":ORG", "").replace('"', '').replace('.com', '')
  562. s1 = s1g.upper()
  563. s2 = match.upper()
  564. from difflib import SequenceMatcher
  565. print(s1)
  566. print(s2)
  567. print(SequenceMatcher(None, s1, s2).ratio())
  568. if SequenceMatcher(None, s1, s2).ratio() >= 0.10:
  569. # and SequenceMatcher(None, s1, s2).ratio()<0.50:
  570. final.append(
  571. "OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace('[',
  572. '').replace(
  573. '.com', '').replace(']', '') + " /" + organizations[1].replace(":ORG", "").replace('"',
  574. '').replace(
  575. '.com', '').replace(']', ''))
  576. else:
  577. final.append("OrganizationName--" + s2)
  578. except IndexError:
  579. try:
  580. if len(("OrganizationName--" + organizations[0].replace(":ORG", "").replace('[', '').replace(']',
  581. '').replace(
  582. '"',
  583. '').replace(
  584. '.com', ''))) < 4:
  585. pass
  586. # company_url()
  587. else:
  588. match = str(urlfinal[0]).lower()
  589. match = match.replace('.com', '').replace('www.', '').replace('.in', '').replace('.co', '').replace(
  590. 'https', '').replace('http', '').replace(":", "").replace("/", "").upper()
  591. s1g = organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace(']',
  592. '').replace(
  593. '.com', '')
  594. s1 = s1g.upper()
  595. s2 = match.upper()
  596. from difflib import SequenceMatcher
  597. print(s1)
  598. print(s2)
  599. print(SequenceMatcher(None, s1, s2).ratio())
  600. if SequenceMatcher(None, s1, s2).ratio() >= 0.10:
  601. # and SequenceMatcher(None, s1, s2).ratio()<0.50:
  602. final.append(
  603. "OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace('[',
  604. '').replace(
  605. ']', '').replace(
  606. '.com', '').replace(']', ''))
  607. else:
  608. final.append("OrganizationName--" + s2)
  609. except IndexError:
  610. org_name()
  611. organisation()
  612. # final.append("OrganizationName--")
  613. # ************************************* CONTACT PERSON *******************************************************************
  614. try:
  615. final.append(
  616. "CONTACTPERSONNAME--" + PErsons[0].replace(":PER", "").replace("[", "").replace('"', '').replace("]",
  617. "") +
  618. PErsons[
  619. 1].replace(":PER", "").replace('"', ''))
  620. except IndexError:
  621. try:
  622. final.append(
  623. "CONTACTPERSONNAME--" + PErsons[0].replace(":PER", "").replace("[", "").replace("]", "").replace(
  624. '"',
  625. ''))
  626. except IndexError:
  627. org_name()
  628. contactpersonname()
  629. # final.append("CONTACTPERSONNAME--")
  630. ###############address flair#####################
  631. try:
  632. print(
  633. '############################################################# address new code #############################################################')
  634. loactionlst = ['address', 'factory', 'd.no', 'h.no', 'h. no', 'plot', 'flat', 'plat']
  635. loclst = [i for i in loactionlst if i in htext.lower()]
  636. textaddress = htext
  637. textaddress = textaddress.replace("|", ",")
  638. textaddress = textaddress.lower()
  639. nlp = pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy="simple")
  640. grop = nlp(textaddress)
  641. citycountry = []
  642. print('########################### city or country name ###########################')
  643. d = grop[-1]
  644. if d['entity_group'] == "COUNTRY":
  645. print(d["word"])
  646. citycountry.append(d["word"])
  647. elif d['entity_group'] == "CITY":
  648. print(d["word"])
  649. citycountry.append(d["word"])
  650. try:
  651. address1 = loclst[0]
  652. except IndexError:
  653. address1 = (textaddress.partition(",")[0])
  654. words = address1.split()
  655. address1 = words[-1]
  656. star_location = address1.lower()
  657. end_location = citycountry[0].replace("#", "")
  658. start = star_location
  659. end = end_location
  660. s = textaddress.lower()
  661. middle_address = (s.split(start))[-1].split(end)[0]
  662. Address = start + middle_address + end
  663. Address = Address.replace('--', '').title()
  664. print(Address)
  665. if Address.count(',') < 2:
  666. splitaddress()
  667. else:
  668. final.append('ADDRESS--' + Address)
  669. # star_location = location[0].replace(":LOC", "").replace('"', '').replace('[', '')
  670. # end_location = location[-1].replace(":LOC", "").replace('"', '').replace(']', '')
  671. # d1 = star_location.split()
  672. # d2 = end_location.split()
  673. # d3 = d1[0]
  674. # d4 = d2[0]
  675. # start = d3
  676. # end = d4
  677. # s = horizontaltext
  678. # middle_address = ((s.split(start))[1].split(end)[0])
  679. # Address = d3 + middle_address + d4
  680. # final.append('ADDRESS--' + Address)
  681. # addrespinlst.append(Address)
  682. except IndexError:
  683. splitaddress()
  684. ########################################## Designation ###########################################
  685. import re
  686. new = []
  687. with open('test.txt', 'r') as f:
  688. flag = False
  689. for line in f:
  690. line1 = line
  691. line = line.upper()
  692. matches = re.findall(
  693. r'''\bAPPRENTICE\b|\bEXECUTIVE\b|\bPROPRIETOR\b|\bPARTNER\b|\bMD\b|\bANALYST\b|\bPRACTITIONER\b|\bCUSTOMER\b|\bCOO\b|\bCOACH\b|\bADMINISTRATIVE\b|\bADMINISTRATOR\b|\bAGENT\b|\bHEAD\b|\bCHIEF\b|\bDIRECTOR\b|\bVICE\b|\bPRESIDENT\b|\bMANAGER\b|\bCOORDINATOR\b|\bCOUNSELOR\b|\bSUPERVISOR\b|\bASSISTANT\b|\bSPECIALIST\b|\bARTIST\b|\bWORKER\b|\bCONSULTANT\b|\bREPRESENTATIVE\b|\bARCHITECT\b|\bSTAFF\b|\bMEMBER\b|\bDEVELOPER\b|\bENGINEER\b|\bEXAMINOR\b|\bDOCTOR\b|\bPROFESSOR\b|\bTEACHER\b|\bLEAD\b|\bOFFICER\b|\bCEO\b|\bC.E.O\b|\bJUNIOR\b|\bSENIOR\b|\bPROFESSOR\b|\bSALES\b''',
  694. line)
  695. for match in matches:
  696. line = line.replace('-', '')
  697. # print(line)
  698. o = "Designation--" + line
  699. new.append(o)
  700. remove_list.append(str(line1).replace('\n', ''))
  701. try:
  702. a = new[0].replace('\n', '')
  703. final.append(a)
  704. except IndexError:
  705. final.append("Designation--")
  706. ###################################################Phone number#################################################
  707. num = []
  708. import phonenumbers
  709. # print(verticaltext)
  710. numbers = phonenumbers.PhoneNumberMatcher(
  711. verticaltext.replace('+91', '').replace('(0)', '').replace('(', '').replace(')', ''), "IN")
  712. for number in numbers:
  713. number = str(number).split(")")
  714. num.append(number[1])
  715. # num.append(number[-1])
  716. if len(num) == 0:
  717. final.append("ContactNumber--")
  718. final.append("OrganizationNumber--")
  719. elif len(num) > 1:
  720. final.append("ContactNumber--" + num[0].replace(' ', ''))
  721. final.append("OrganizationNumber--" + num[-1].replace(' ', ''))
  722. elif len(num) == 1:
  723. try:
  724. final.append("ContactNumber--" + num[0].replace(' ', ''))
  725. final.append("OrganizationNumber--")
  726. except IndexError:
  727. final.append("ContactNumber--")
  728. final.append("OrganizationNumber--")
  729. print(
  730. '############################################################# num #############################################################')
  731. print(num)
  732. # try:
  733. # final.append("PhoneNumber--" + num[0].replace(' ', ''))
  734. # remove_list.append(num[0])
  735. # except IndexError:
  736. # pass
  737. # try:
  738. # final.append("PhoneNumber1--" + num[1].replace(' ', ''))
  739. # remove_list.append(num[1])
  740. # except IndexError:
  741. # pass
  742. # try:
  743. # final.append("PhoneNumber2--" + num[2].replace(' ', ''))
  744. # remove_list.append(num[2])
  745. # except IndexError:
  746. # pass
  747. ################################################### Email######################################################
  748. import re
  749. from email_scraper import scrape_emails
  750. s = list(scrape_emails(horizontaltext))
  751. email_id = s
  752. # email_id = []
  753. # matches = re.findall(r'[\w\.-]+@[\w\.-]+', verticaltext)
  754. # for match in matches:
  755. # email_id.append(match)
  756. # # final.append('Email--' + match)
  757. # email_ = str(email_id).replace("[", "").replace("]", "").replace("'", "")
  758. # # final.append(email_)
  759. # # final.append('Email--' + email_)
  760. # # remove_list.append(email_)
  761. if len(email_id) > 1:
  762. final.append(
  763. 'ContactEmail--' + str(email_id[0]).replace("[", "").replace("]", "").replace("\\n", "").replace("'",
  764. ""))
  765. final.append(
  766. 'OrganizationEmail--' + str(email_id[-1]).replace("[", "").replace("]", "").replace("\\n", "").replace(
  767. "'",
  768. ""))
  769. else:
  770. try:
  771. final.append(
  772. 'ContactEmail--' + str(email_id[0]).replace("[", "").replace("]", "").replace("\\n", "").replace(
  773. "'",
  774. ""))
  775. final.append('OrganizationEmail--')
  776. except IndexError:
  777. final.append('ContactEmail--')
  778. final.append('OrganizationEmail--')
  779. ###############PINCODE############
  780. pinlst = []
  781. print(addrespinlst)
  782. import pgeocode
  783. # try:
  784. # matche1 = re.findall(r'-\d{6}|\b\d{6}\b|\b\d{3} \d{3}\b|-\d{3} \d{3}|\b\d{3} \d{3}\b', addrespinlst[0])
  785. # for i in matche1:
  786. # address3 = i.replace(' ', '').replace('-', '')
  787. # pinlst.append(address3)
  788. # except IndexError:
  789. lst = []
  790. for i in num:
  791. i = i[1:]
  792. lst.append(i)
  793. infile = r"vtext.txt"
  794. outfile = r"cleaned_file.txt"
  795. import glob
  796. delete_list = lst
  797. # delete_list = ["firstname1 lastname1","firstname2 lastname2","firstnamen lastnamen",'Director - Sales & Business Development']
  798. fin = open(infile, "r+")
  799. fout = open(outfile, "w+")
  800. for line12 in fin:
  801. for word in delete_list:
  802. line12 = line12.replace(word, "")
  803. fout.write(line12)
  804. fin.close()
  805. # print(line)
  806. # print(addrespinlst)
  807. import pgeocode
  808. print(line12)
  809. import re
  810. matche1 = re.findall(r'-\d{6}|\b\d{6}\b|\b\d{3} \d{3}\b|-\d{3} \d{3}|\b\d{3} \d{3}\b', line12)
  811. for i in matche1:
  812. address3 = i.replace(' ', '').replace('-', '')
  813. pinlst.append(address3)
  814. nomi = pgeocode.Nominatim('IN')
  815. try:
  816. a = nomi.query_postal_code(str(pinlst[-1]))
  817. # print(a)
  818. b = a.keys()
  819. c = b.values.tolist()
  820. d = a.tolist()
  821. postal_code = "PinCode1" + "--" + d[0]
  822. final.append(postal_code)
  823. country_code = c[1] + "--" + str(d[1])
  824. final.append(country_code)
  825. place_name = 'LandMark1' + "--" + str(d[2])
  826. final.append(place_name)
  827. state_name = c[3] + "--" + str(d[3])
  828. final.append(state_name)
  829. state_code = c[4] + "--" + str(d[4])
  830. final.append(state_code)
  831. county_name = 'CityName1' + "--" + str(d[5])
  832. final.append(county_name)
  833. except (IndexError, NameError):
  834. final.append("PinCode1--")
  835. final.append("country_code--")
  836. final.append("LandMark1--")
  837. final.append("state_name--")
  838. final.append("state_code--")
  839. final.append("CityName1--")
######################################################## json #####################################################################
# Convert the "Key--Value" strings collected in `final` into a one-row JSON
# object by round-tripping through two CSV files: split into columns, write,
# transpose so Keys become the header row, re-read, dump as JSON, then strip
# the wrapper characters off the dumped string.
import pandas as pd
df = pd.DataFrame(final)
df1 = df[0].str.split('--', expand=True)
# print(df1)
# The last two columns of the split are the key and value; extra columns
# only exist when a value itself contained '--'.
df1.rename({df1.columns[-2]: 'Keys'}, axis=1, inplace=True)
df1.rename({df1.columns[-1]: 'Values'}, axis=1, inplace=True)
df1['Keys'] = df1['Keys'].str.strip()
df1.to_csv('path123.csv', index=False)
df2 = pd.read_csv('path123.csv')
print(df2)
df2 = df2.T
df2.to_csv('path1.csv', index=False, header=False)
df1 = pd.read_csv('path1.csv')
df1.to_json('firstjson1.json', orient="index")
import json
with open('firstjson1.json', 'r') as json_file:
    json_load = json.load(json_file)
# # url = "https://test.bizgaze.app:8443/apis/v4/bizgaze/integrations/businesscards/create"
# Strip the orient="index" row wrapper ({"0": ...}) and any brackets so
# `nothing` is a flat JSON-object string.
nothing = json.dumps(json_load).replace("]", "").replace("[", "").replace('{"0":', '').replace('}}', '}')
# # print('--------------------------------------------------------------------------')
# # print(nothing)
  862. empty = []
  863. import base64
  864. name = found
  865. image = open(name, 'rb')
  866. image_read = image.read()
  867. image_64_encode = base64.b64encode(image_read)
  868. NULL = 'null'
  869. empty.append("ByteData--" + (NULL).strip('""'))
  870. image_64_encode = image_64_encode.decode('utf-8')
  871. empty.append("FileData--" + str(image_64_encode))
  872. imagedata = name.split("/")
  873. imagename = str(imagedata[-1]).replace('"', '').replace("[", "").replace("]", "")
  874. imagename1 = str(imagename).split('.')
  875. imagename = str(imagename1[-2]).replace("[", "]")
  876. empty.append("FileName--" + imagename)
  877. empty.append("FilePath--"+ "")
  878. imageExtension = str(imagename1[-1]).replace("[", "]")
  879. empty.append("FileType--" + imageExtension)
  880. image.close()
  881. import pandas as pd
  882. df = pd.DataFrame(empty)
  883. df = df[0].str.split("--", expand=True)
  884. data1 = pd.DataFrame(df[0])
  885. data2 = pd.DataFrame(df[1])
  886. dt = data2.set_index(data1[0])
  887. dt4 = dt.T
  888. dictionary = dt4.to_dict(orient="index")
  889. list1 = []
  890. # list.append(a)
  891. list1.append(dictionary[1])
  892. # # final.append("image--"+str(dictionary[1]).replace("\'",'"'))
  893. print('--------------------')
  894. # print(namelist)
  895. import json
  896. # JSON data:
  897. x = nothing
  898. # python object to be appended
  899. y = {"image": dictionary[1]}
  900. # parsing JSON string:
  901. z = json.loads(x)
  902. # appending the data
  903. z.update(y)
  904. # the result is a JSON string:
  905. # print(json.dumps(z))
  906. zlist=[]
  907. zlist.append(z)
#############################################creating csv#####################################
# POST the assembled payload to the BizGaze business-cards endpoint, delete
# the source image on confirmed success, and return the raw response text.
print(final)
print(imagelist)
final.append('image--' + str(imagelist))
import requests
import json
url = "https://anwi.bizgaze.app/apis/v4/bizgaze/integrations/businesscards/create/list" #dev
# url = "https://qa.bizgaze.app/apis/v4/bizgaze/integrations/businesscards/create" #testing
# url = "https://test.bizgaze.app/apis/v4/bizgaze/integrations/businesscards/create" # test
# url='http://localhost:3088/apis/v4/bizgaze/integrations/businesscards/create'
# url = 'https://c01.bizgaze.app/apis/v4/bizgaze/integrations/businesscards/create' # C01
payload1 = json.dumps(zlist)
# print('--------------------------------------------------------------------------')
#print(payload1)
# SECURITY NOTE(review): static API tokens are committed in source here;
# they should be moved to configuration / environment variables.
headers = {
    #'Authorization': 'stat 1a936137490040c997928f485e3cdd7a', #dev
    'Authorization': 'stat 16516391d0074f4c8a15ea16fb49470b',#testing
    # 'Authorization': 'stat 08e55fcfbaa940c8ab8145a074c444d1',
    # 'Authorization': 'stat f7cdb402e01e44e5842878653946168f', # c01
    # 'Authorization': 'Stat c3e11b2fcbfe455b86a1fe6efde02a69',#demo
    'Content-Type': 'application/json'
}
response = requests.request("POST", url, headers=headers, data=payload1)
# print("##############################################################")
#print(payload1)
print(response.text)
import os
# Delete the uploaded image only once the server confirms creation.
if 'BusinessCards Created Successfully' in response.text:
    print('present')
    os.remove(found)
else:
    print('not present')
df1.to_json('visitingcard.json')
# NOTE(review): DataFrame.to_json returns None when given a path, so this
# prints None — presumably meant to capture the JSON string; confirm.
data = df1.to_json('visiting.json', orient='records')
print(data)
#return render_template('index.html')
return response.text
# return 'done'
if __name__ == "__main__":
    # Serve the Flask app on port 1112. NOTE(review): host='0.0.0.0' binds
    # all interfaces — confirm this exposure is intended for deployment.
    app.run(host='0.0.0.0', port=1112)