Нет описания
Вы не можете выбрать более 25 тем Темы должны начинаться с буквы или цифры, могут содержать дефисы(-) и должны содержать не более 35 символов.

Business_cards.py 42KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048
  1. @app.route('/upload_BusinessCards', methods=["POST"])
  2. # @app.route('/multiplecards', methods=["POST"])
  3. def multiplecards():
  4. # print('################## multiple card detection #######################')
  5. # print(Dataset)
  6. datalist=[]
  7. Dataset = request.get_json()
  8. # print(data)
  9. #datalist.append(Dataset)
  10. data = {'visiting': Dataset}
  11. for i in data['visiting']:
  12. import time
  13. # time.sleep(1)
  14. a = i
  15. x = a['FileData']
  16. # print(x)
  17. y = a['FileName']
  18. z = a['FileType']
  19. # CreatedBy=a['CreatedBy']
  20. name = y + '.' + z
  21. # print(name)
  22. # print(y)
  23. # image = y.split("/")
  24. # filename=image[-1]
  25. # print(x)
  26. img_data = x.encode()
  27. import base64
  28. with open('./multicards/' + name, "wb") as fh:
  29. fh.write(base64.decodebytes(img_data))
  30. # print(i)
  31. # import os
  32. # import glob
  33. # for i in glob.glob('./multipleupload/*'):
  34. found = './multicards/' + name
  35. print(found)
  36. extension = found.split('.')[-1]
  37. # for root, dirs, fils in os.glob('./multipleupload'):
  38. # for name in files:
  39. # foundfile= os.path.join(root, name)
  40. # print(foundfile)
  41. import re
  42. import csv
  43. import glob
  44. import os
  45. # import pytesseract
  46. # import cv2
  47. import numpy as np
  48. import glob
  49. import os
  50. import cv2
  51. import requests
  52. final = []
  53. # final.append('assignto--'+CreatedBy)
  54. imagelist = []
  55. # print(found)
  56. remove_list = []
  57. import os
  58. import glob
  59. import pdfminer
  60. # import os
  61. # ts = 0
  62. # for file_name in glob.glob('./upload/*'):
  63. # fts = os.path.getmtime(file_name)
  64. # if fts > ts:
  65. # ts = fts
  66. # found = file_name
  67. # print(found)
  68. # print(extension)
def org_name():
    """OCR the current card and run flair NER over the extracted text.

    Reads the enclosing-scope variables `found` (path of the uploaded card
    file) and `extension` (its file extension).  For image input the file is
    grayscaled in place, re-saved at 1200 dpi, rendered to a searchable PDF
    with Tesseract, and the text is mined from that PDF; for PDF input the
    text is extracted directly.  The text is then tagged with the
    enclosing-scope flair `tagger` and the tags are bucketed into
    PER/ORG/LOC lists.

    NOTE(review): `organizations`, `PErsons` and `location` are assigned
    here as *locals* of org_name, so they do not update the same-named
    variables in the enclosing scope that organisation() /
    contactpersonname() read — confirm this is intended.
    """
    print('org_name is working')
    import pytesseract
    fname = found
    if extension != 'pdf':
        # Image input: grayscale the file in place, then re-save at
        # 1200 dpi so Tesseract has more pixels to work with.
        img = cv2.imread(fname)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        cv2.imwrite(str(found), img)
        from PIL import Image
        im = Image.open(found)
        im.save("images1.png", dpi=(1200, 1200))
        fname = "images1.png"
        import pytesseract as tess
        from PIL import Image
        # Hard-coded Windows install path of the Tesseract binary.
        tess.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
        # Render the image to a searchable PDF, then mine that PDF for text.
        pdf = tess.image_to_pdf_or_hocr(fname, extension="pdf")
        with open("demo.pdf", "w+b", ) as f:
            f.write(pdf)
        from pdfminer.high_level import extract_text
        text = extract_text('demo.pdf')
    else:
        # PDF input: extract the text directly.
        from pdfminer.high_level import extract_text
        text = extract_text(fname)
    sentence = Sentence(text)
    # predict NER tags with the enclosing-scope flair tagger
    tagger.predict(sentence)
    ko = (sentence)
    # str(sentence) renders as 'text → [tags]'; keep only the tag part.
    ko1 = str(ko).split("→")
    import pandas as pd
    dfg = []
    try:
        # NOTE(review): both .replace("", "") calls are no-ops — possibly
        # zero-width characters lost in transit; confirm against history.
        s = ko1[1].replace("", "").replace("", "").replace("/", ":")
        dfg.append(s)
        # Round-trip through CSV to split the tag string into one entry
        # per tag.
        df = pd.DataFrame(dfg)
        df = df[0]
        df.to_csv("df.csv", index=False)
        df1 = pd.read_csv("df.csv")
        ve = df1["0"].str.split(",")
        fgf = ve.to_list()
        dfgh = pd.DataFrame(fgf[0])
        maindf = dfgh[0]  # .str.split(":")
        main1 = maindf.to_list()
        main1
        # Bucket tags by entity type (substring test on e.g. ':ORG').
        per = ["PER"]
        org = ["ORG"]
        loc = ["LOC"]
        organizations = [i for i in main1 for j in org if j in i]
        PErsons = [i for i in main1 for j in per if j in i]
        location = [i for i in main1 for j in loc if j in i]
    except IndexError:
        # No '→' in the rendered sentence: no tags were predicted.
        pass
  132. # ************************************* ORGANIZATION ********************************************************************
def organisation():
    """Append one 'OrganizationName--' entry to the shared `final` list.

    Tries, in order:
      1. two ORG tags joined as 'first /second', cross-checked against the
         first detected URL (`urlfinal`) with difflib.SequenceMatcher;
      2. a single ORG tag, cross-checked the same way;
      3. the URL itself, cleaned of scheme/TLD noise;
      4. the keyword-based company() fallback.
    Each step falls through to the next on IndexError (missing ORG tag or
    missing URL).  When the similarity ratio is >= 0.10 the ORG text wins,
    otherwise the cleaned URL is used.

    Reads the enclosing-scope variables `organizations`, `urlfinal` and
    `final`.
    """
    print('organisation working ')
    try:
        # Guard against a degenerate combined string.
        # NOTE(review): the prefix alone is 18 characters, so this `< 4`
        # test can never be true — confirm the intended threshold.
        if len(("OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace('.com', '').replace('.in', '') + " /" + organizations[1].replace(":ORG", "").replace('"', '').replace('.com', ''))) < 4:
            pass
        else:
            # Normalise the first URL for comparison: drop TLD/scheme noise.
            match = str(urlfinal[0]).lower()
            match = match.replace('.com', '').replace('www.', '').replace('.in', '').replace('.co', '').replace('https', '').replace('http', '').replace(":", "").replace("/", "").upper()
            print(match)
            s1g = organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace('.com', '') + " /" + organizations[1].replace(":ORG", "").replace('"', '').replace('.com', '')
            s1 = s1g.upper()
            s2 = match.upper()
            from difflib import SequenceMatcher
            print(s1)
            print(s2)
            print(SequenceMatcher(None, s1, s2).ratio())
            if SequenceMatcher(None, s1, s2).ratio() >= 0.10:
                # ORG text roughly agrees with the URL: keep the ORG text.
                final.append("OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace('.com', '').replace(']', '') + " /" + organizations[1].replace(":ORG", "").replace('"', '').replace('.com', '').replace(']', ''))
            else:
                # Otherwise trust the cleaned URL.
                final.append("OrganizationName--" + s2)
    except IndexError:
        # Fewer than two ORG tags (or no URL): retry with a single ORG tag.
        try:
            # NOTE(review): same dead `< 4` guard as above.
            if len(("OrganizationName--" + organizations[0].replace(":ORG", "").replace('[', '').replace(']', '').replace('"', '').replace('.com', '').replace('.in', ''))) < 4:
                pass
            else:
                match = str(urlfinal[0]).lower()
                match = match.replace('.com', '').replace('www.', '').replace('.in', '').replace('.co', '').replace('https', '').replace('http', '').replace(":", "").replace("/", "").upper()
                s1g = organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace('.com', '')
                s1 = s1g.upper()
                s2 = match.upper()
                from difflib import SequenceMatcher
                print(s1)
                print(s2)
                print(SequenceMatcher(None, s1, s2).ratio())
                if SequenceMatcher(None, s1, s2).ratio() >= 0.10:
                    final.append("OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace(']', '').replace('.com', ''))
                else:
                    final.append("OrganizationName--" + s2)
        except IndexError:
            # No usable ORG tag at all: fall back to the URL...
            try:
                match = str(urlfinal[0]).lower()
                match = match.replace('.com', '').replace('www.', '').replace('.in', '').replace('.co', '').upper()
                final.append("OrganizationName--" + match)
            except IndexError:
                # ...and finally to the keyword scan over test.txt.
                company()
  208. #################################################company Name########################################
  209. def company():
  210. print('company list working')
  211. import re
  212. new = []
  213. with open('test.txt', 'r+') as f:
  214. flag = False
  215. for line in f:
  216. line = line.upper()
  217. matches = re.findall(
  218. r'''\bENTERPRISE\b|\bTRADE\b|\bEMPIRE\b|\bSTORES\b|\bMACHINERY\b|\bINDUSTRIES\b|\bTECHNOLOGY\b|\bCOMPANY\b|\bDESIGNERS\b|\bPOLYMER\b|\bBELT\b|\bAGRO\b|\bPLASTIC\b|\bGROUP\b|\bTOOLS\b|\bENGG.\b|\bSOLUTION\b|\bCONSTRUCTION\b|\bPACK\b|\bELECT\b|\bSTEEL\b|\bIRON\b|\bDIES\b|\bMOULD\b|\bCORPORATION\b|\bSEEDS\b|\bPOWER\b|\bCONSULTANT\b|\bMFG.\b|\bPRINT\b|\bFOOD\b|\bSOLAR\b|\bINDUSTRY\b|\bLIMITED\b|\bPRIVATE\b|\bPVT\b|\bLTD\b|\bOUTSOURCING\b|\bCNC\b|\bMACHINERIES\b|\bSOLUTIONS\b|\bENGINEERS\b|\bWORKS\b|\bPRODUCTS\b|\bENTERPRISES\b|\bCOMPANIES\b|\bPOLYMERS\b|\bTRADING\b''',
  219. line)
  220. for i in matches:
  221. if i in line:
  222. flag = True
  223. if flag:
  224. o = "OrganizationName--" + line
  225. new.append(o)
  226. # if line.startswith('\n'):
  227. # flag = False
  228. try:
  229. a = new[0].replace('\n', '')
  230. final.append(a)
  231. except IndexError:
  232. final.append("OrganizationName--")
  233. # ************************************* CONTACT PERSON *******************************************************************
  234. def contactpersonname():
  235. print('contactpersonname working')
  236. try:
  237. final.append(
  238. "CONTACTPERSONNAME--" + PErsons[0].replace(":PER", "").replace("[", "").replace('"', '').replace(
  239. "]",
  240. "") + '/' +
  241. PErsons[
  242. 1].replace(":PER", "").replace('"', ''))
  243. except IndexError:
  244. try:
  245. final.append(
  246. "CONTACTPERSONNAME--" + PErsons[0].replace(":PER", "").replace("[", "").replace("]",
  247. "").replace(
  248. '"', ''))
  249. except IndexError:
  250. final.append("CONTACTPERSONNAME--")
def image_to_text():
    """OCR the image at `found` (enclosing scope) and stash the text.

    Grayscales the image and overwrites the file in place *before* running
    OCR on the same path.  Recognised strings shorter than 4 characters are
    dropped as noise.  The result is stored on the function object itself
    as `image_to_text.txt`, one recognised string per line.

    NOTE(review): `ocr` comes from the enclosing scope — presumably a
    PaddleOCR instance (result[0] is a list of [box, (text, score)]
    entries); confirm against the initialisation code outside this view.
    """
    import cv2
    img_path = found
    # Grayscale the file in place so the OCR below reads the processed image.
    img = cv2.imread(img_path)
    img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    cv2.imwrite(str(found), img)
    result = ocr.ocr(img_path, cls=True)
    result = result[0]
    # Each entry is [box, (text, confidence)]; keep the text only.
    txts = [line[1][0] for line in result]
    image_to_text.txt = ""
    for i in txts:
        if len(i) < 4:
            # Skip very short fragments (noise).
            continue
        image_to_text.txt = image_to_text.txt + str(i) + "\n"
def pdf_to_text():
    """Extract the text of the PDF at `found` (enclosing scope) and store
    it on the function object as `pdf_to_text.txt`."""
    from pdfminer.high_level import extract_text
    pdf_to_text.txt = extract_text(found)
  279. extensionlist = ['JPEG', 'jpg', 'png', 'JPG', 'PNG', 'jpeg']
  280. if extension in extensionlist:
  281. print('image' + extension)
  282. image_to_text()
  283. x = image_to_text.txt
  284. else:
  285. print('pdf' + extension)
  286. pdf_to_text()
  287. x = pdf_to_text.txt
  288. verticaltext = x
  289. htext = x
  290. # print('------------------------------------------------')
  291. print(
  292. '############################################################# this is verticaltext #################################################################')
  293. print(verticaltext)
  294. htext = htext.replace('\n', ' ')
  295. print(
  296. '############################################################# this is htext #############################################################')
  297. print(htext)
  298. y = x.replace('\n', ',')
  299. y = y.replace(' ', ' ')
  300. # y = y.replace(".", " .")
  301. horizontaltext = y
  302. # print('------------------------------------------------')
  303. print(
  304. '############################################################# this is horizontaltext #############################################################')
  305. print(horizontaltext)
  306. textfile = open("test123456.txt", "w")
  307. a = textfile.write(verticaltext)
  308. textfile.close()
  309. textfile = open("vtext.txt", "w")
  310. a = textfile.write(horizontaltext)
  311. textfile.close()
  312. with open('test123456.txt', 'r') as f:
  313. with open('test.txt', 'w') as w:
  314. for line in f:
  315. if line.strip().replace('|', ''):
  316. w.write(line)
  317. ###########################ADDRESS##################################
  318. addrespinlst = []
def splitaddress():
    """Regex fallback that assembles an address from `htext` (enclosing scope).

    Builds three pieces:
      * address1 — last word before the first comma of the flattened text;
      * address2 — the text captured in front of a postal-code-like number
        (only the LAST regex hit survives the loop);
      * address3 — the postal-code-like number itself (again, last hit wins).
    On success appends 'ADDRESS--<address1>,<address2>,<address3>' to the
    shared `final` list and records the address in `addrespinlst`.

    NOTE(review): if either findall comes up empty, `address2`/`address3`
    are never bound and the NameError branch deliberately swallows the
    failure (the commented-out spaCy model used to live there) — so this
    function may append nothing at all.
    """
    import re
    textaddress = htext.replace('\n', ' ')
    # Last word before the first comma, e.g. a door/house number.
    address1 = (textaddress.partition(",")[0])
    words = address1.split()
    address1 = words[-1]
    addre = (htext.partition(",")[2])
    a = addre.replace('\n', ' ').replace('\x0c', '')
    addre = (a.partition(",")[2])
    # Capture whatever precedes an Indian-PIN-like pattern
    # (6 digits, '3 3' split digits, or a trailing -NN).
    matches = re.findall(
        r'(.*?)-\d{3} \d{3}|(.*?)\b-\d{6}\b|(.*?)\b\d{6}\b|(.*?)\b\d{3} \d{3}\b|\b(.*?)-\d{2}\b|(.*?)\b\d{3} \d{3}\b',
        a)
    for match in matches:
        address2 = match
        address2 = str(address2)
        # findall returned a tuple of groups; flatten it to plain text.
        # NOTE(review): the final replace strips every space from address2
        # — possibly a collapsed double-space in the original; confirm.
        address2 = address2.replace("'", "").replace("(", "").replace(")", "").replace(', ,', '').replace(' ',
                                                                                                          '')
    # The PIN/postal code itself; only the last hit is kept.
    matches = re.findall(r'-\d{6}|\b\d{6}\b|\b\d{3} \d{3}\b|\b-\d{2}\b|-\d{3} \d{3}|\b\d{3} \d{3}\b', a)
    for address3 in matches:
        pass
    try:
        Address = address1 + "," + address2 + "," + address3
        final.append('ADDRESS--' + Address)
        addrespinlst.append(Address)
    except NameError:
        # No PIN-like pattern found: deliberately leave the address out.
        print(
            '############################################################ Addressmodelworking #############################################################')
        pass
  363. ################################################## website#######################################################
  364. # import re
  365. # url = []
  366. # matches = re.findall(r'www.*', verticaltext)
  367. # for match in matches:
  368. # if (match.count('.')) == 1:
  369. # a_string1 = match.replace("www", "www.")
  370. # final.append("Urls--" + a_string1)
  371. # url.append(a_string1)
  372. # else:
  373. # final.append("Urls--" + match)
  374. # if len(url)==0:
  375. # from urlextract import URLExtract
  376. # extractor = URLExtract()
  377. # urls = extractor.find_urls(verticaltext)
  378. # try:
  379. # urllist = urls[0]
  380. # final.append("Urls--"+urllist)
  381. # url.append(urllist)
  382. # except IndexError:
  383. # final.append("Urls--")
  384. # for match in matches:
  385. # if (match.count('.')) == 1:
  386. # a_string1 = match.replace("www", "www.")
  387. # final.append("Urls--" + a_string1)
  388. # url.append(a_string1)
  389. # else:
  390. # final.append("Urls--" + match)
  391. # url.append(match)
  392. # remove_list.append(match)
  393. # else:
  394. # final.append("Urls--" )
  395. ################################################## website#######################################################
  396. import re
  397. # final=[]
  398. url = []
  399. urlfinal = []
  400. matches = re.findall(r'www.*', verticaltext)
  401. for match in matches:
  402. if (match.count('.')) == 1:
  403. a_string1 = match.replace("www", "www.")
  404. # final.append("Urls--" + a_string1)
  405. url.append(a_string1)
  406. else:
  407. url.append(match)
  408. if len(url) == 0:
  409. from urlextract import URLExtract
  410. extractor = URLExtract()
  411. urls = extractor.find_urls(verticaltext)
  412. try:
  413. urllist = urls[0]
  414. url.append(urllist)
  415. url.append(urllist)
  416. except IndexError:
  417. pass
  418. for match in matches:
  419. if (match.count('.')) == 1:
  420. a_string1 = match.replace("www", "www.")
  421. url.append(a_string1)
  422. # url.append(a_string1)
  423. else:
  424. url.append(match)
  425. url.append(match)
  426. else:
  427. pass
  428. try:
  429. test_string = url[0]
  430. test_list = ['com', 'www', 'in', 'co', "WWW", "COM", "CO", "IN"]
  431. res = [ele for ele in test_list if (ele in test_string)]
  432. if len(res) == 0:
  433. print('no match')
  434. final.append('urls--')
  435. else:
  436. print('matched')
  437. final.append('urls--' + url[0])
  438. urlfinal.append(url[0])
  439. except IndexError:
  440. final.append('urls--')
  441. print(
  442. '############################################################# url #############################################################')
  443. print(url)
  444. #######organisation and contact################
  445. # def company_url():
  446. # # print('--url--')
  447. # # print(url)
  448. # try:
  449. # match = str(url[0]).lower()
  450. # match =match.replace('.com','').replace('www.','').replace('.in','').replace('.co','').upper()
  451. # final.append("OrganizationName--" + match)
  452. # # remove_list.append(match)
  453. # except IndexError:
  454. # org_name()
  455. # organisation()
  456. # final.append("OrganizationName--")
  457. # make example sentence
  458. # print(horizontaltext)
  459. sentence = Sentence(verticaltext)
  460. # predict NER tags
  461. tagger.predict(sentence)
  462. # print sentence
  463. ko = (sentence)
  464. ko1 = str(ko).split("→")
  465. import pandas as pd
  466. dfg = []
  467. try:
  468. s = ko1[1].replace("", "").replace("", "").replace("/", ":")
  469. except IndexError:
  470. os.remove(found)
  471. return 'Invalid image'
  472. dfg.append(s)
  473. df = pd.DataFrame(dfg)
  474. df = df[0]
  475. df.to_csv("df.csv", index=False)
  476. df1 = pd.read_csv("df.csv")
  477. ve = df1["0"].str.split(",")
  478. fgf = ve.to_list()
  479. dfgh = pd.DataFrame(fgf[0])
  480. maindf = dfgh[0] # .str.split(":")
  481. # maindf.to_csv("main.csv")
  482. main1 = maindf.to_list()
  483. main1
  484. # cv=pd.DataFrame(ve)
  485. # cv
  486. per = ["PER"]
  487. org = ["ORG"]
  488. loc = ["LOC"]
  489. organizations = [i for i in main1 for j in org if j in i]
  490. PErsons = [i for i in main1 for j in per if j in i]
  491. location = [i for i in main1 for j in loc if j in i]
  492. # ************************************* ORGANIZATION ********************************************************************
  493. try:
  494. if len(("OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace('[',
  495. '').replace(
  496. ']', '').replace(
  497. '.com', '') + " /" + organizations[1].replace(":ORG", "").replace('"', '').replace('.com', ''))) < 4:
  498. pass
  499. # company_url()
  500. else:
  501. match = str(urlfinal[0]).lower()
  502. match = match.replace('.com', '').replace('www.', '').replace('.in', '').replace('.co', '').replace(
  503. 'https',
  504. '').replace(
  505. 'http', '').replace(":", "").replace("/", "").upper()
  506. print(match)
  507. s1g = organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace(']', '').replace(
  508. '.com', '') + " /" + \
  509. organizations[1].replace(":ORG", "").replace('"', '').replace('.com', '')
  510. s1 = s1g.upper()
  511. s2 = match.upper()
  512. from difflib import SequenceMatcher
  513. print(s1)
  514. print(s2)
  515. print(SequenceMatcher(None, s1, s2).ratio())
  516. if SequenceMatcher(None, s1, s2).ratio() >= 0.10:
  517. # and SequenceMatcher(None, s1, s2).ratio()<0.50:
  518. final.append(
  519. "OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace('[',
  520. '').replace(
  521. '.com', '').replace(']', '') + " /" + organizations[1].replace(":ORG", "").replace('"',
  522. '').replace(
  523. '.com', '').replace(']', ''))
  524. else:
  525. final.append("OrganizationName--" + s2)
  526. except IndexError:
  527. try:
  528. if len(("OrganizationName--" + organizations[0].replace(":ORG", "").replace('[', '').replace(']',
  529. '').replace(
  530. '"',
  531. '').replace(
  532. '.com', ''))) < 4:
  533. pass
  534. # company_url()
  535. else:
  536. match = str(urlfinal[0]).lower()
  537. match = match.replace('.com', '').replace('www.', '').replace('.in', '').replace('.co', '').replace(
  538. 'https', '').replace('http', '').replace(":", "").replace("/", "").upper()
  539. s1g = organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace(']',
  540. '').replace(
  541. '.com', '')
  542. s1 = s1g.upper()
  543. s2 = match.upper()
  544. from difflib import SequenceMatcher
  545. print(s1)
  546. print(s2)
  547. print(SequenceMatcher(None, s1, s2).ratio())
  548. if SequenceMatcher(None, s1, s2).ratio() >= 0.10:
  549. # and SequenceMatcher(None, s1, s2).ratio()<0.50:
  550. final.append(
  551. "OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace('[',
  552. '').replace(
  553. ']', '').replace(
  554. '.com', '').replace(']', ''))
  555. else:
  556. final.append("OrganizationName--" + s2)
  557. except IndexError:
  558. org_name()
  559. organisation()
  560. # final.append("OrganizationName--")
  561. # ************************************* CONTACT PERSON *******************************************************************
  562. try:
  563. final.append(
  564. "CONTACTPERSONNAME--" + PErsons[0].replace(":PER", "").replace("[", "").replace('"', '').replace("]",
  565. "") +
  566. PErsons[
  567. 1].replace(":PER", "").replace('"', ''))
  568. except IndexError:
  569. try:
  570. final.append(
  571. "CONTACTPERSONNAME--" + PErsons[0].replace(":PER", "").replace("[", "").replace("]", "").replace(
  572. '"',
  573. ''))
  574. except IndexError:
  575. org_name()
  576. contactpersonname()
  577. # final.append("CONTACTPERSONNAME--")
  578. ###############address flair#####################
  579. try:
  580. print(
  581. '############################################################# address new code #############################################################')
  582. loactionlst = ['address', 'factory', 'd.no', 'h.no', 'h. no', 'plot', 'flat', 'plat']
  583. loclst = [i for i in loactionlst if i in htext.lower()]
  584. textaddress = htext
  585. textaddress = textaddress.replace("|", ",")
  586. textaddress = textaddress.lower()
  587. nlp = pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy="simple")
  588. grop = nlp(textaddress)
  589. citycountry = []
  590. print('########################### city or country name ###########################')
  591. d = grop[-1]
  592. if d['entity_group'] == "COUNTRY":
  593. print(d["word"])
  594. citycountry.append(d["word"])
  595. elif d['entity_group'] == "CITY":
  596. print(d["word"])
  597. citycountry.append(d["word"])
  598. try:
  599. address1 = loclst[0]
  600. except IndexError:
  601. address1 = (textaddress.partition(",")[0])
  602. words = address1.split()
  603. address1 = words[-1]
  604. star_location = address1.lower()
  605. end_location = citycountry[0].replace("#", "")
  606. start = star_location
  607. end = end_location
  608. s = textaddress.lower()
  609. middle_address = (s.split(start))[-1].split(end)[0]
  610. Address = start + middle_address + end
  611. Address = Address.replace('--', '').title()
  612. print(Address)
  613. if Address.count(',') < 2:
  614. splitaddress()
  615. else:
  616. final.append('ADDRESS--' + Address)
  617. # star_location = location[0].replace(":LOC", "").replace('"', '').replace('[', '')
  618. # end_location = location[-1].replace(":LOC", "").replace('"', '').replace(']', '')
  619. # d1 = star_location.split()
  620. # d2 = end_location.split()
  621. # d3 = d1[0]
  622. # d4 = d2[0]
  623. # start = d3
  624. # end = d4
  625. # s = horizontaltext
  626. # middle_address = ((s.split(start))[1].split(end)[0])
  627. # Address = d3 + middle_address + d4
  628. # final.append('ADDRESS--' + Address)
  629. # addrespinlst.append(Address)
  630. except IndexError:
  631. splitaddress()
  632. ########################################## Designation ###########################################
  633. import re
  634. new = []
  635. with open('test.txt', 'r') as f:
  636. flag = False
  637. for line in f:
  638. line1 = line
  639. line = line.upper()
  640. matches = re.findall(
  641. r'''\bAPPRENTICE\b|\bEXECUTIVE\b|\bPROPRIETOR\b|\bPARTNER\b|\bMD\b|\bANALYST\b|\bPRACTITIONER\b|\bCUSTOMER\b|\bCOO\b|\bCOACH\b|\bADMINISTRATIVE\b|\bADMINISTRATOR\b|\bAGENT\b|\bHEAD\b|\bCHIEF\b|\bDIRECTOR\b|\bVICE\b|\bPRESIDENT\b|\bMANAGER\b|\bCOORDINATOR\b|\bCOUNSELOR\b|\bSUPERVISOR\b|\bASSISTANT\b|\bSPECIALIST\b|\bARTIST\b|\bWORKER\b|\bCONSULTANT\b|\bREPRESENTATIVE\b|\bARCHITECT\b|\bSTAFF\b|\bMEMBER\b|\bDEVELOPER\b|\bENGINEER\b|\bEXAMINOR\b|\bDOCTOR\b|\bPROFESSOR\b|\bTEACHER\b|\bLEAD\b|\bOFFICER\b|\bCEO\b|\bC.E.O\b|\bJUNIOR\b|\bSENIOR\b|\bPROFESSOR\b|\bSALES\b''',
  642. line)
  643. for match in matches:
  644. line = line.replace('-', '')
  645. # print(line)
  646. o = "Designation--" + line
  647. new.append(o)
  648. remove_list.append(str(line1).replace('\n', ''))
  649. try:
  650. a = new[0].replace('\n', '')
  651. final.append(a)
  652. except IndexError:
  653. final.append("Designation--")
  654. ###################################################Phone number#################################################
  655. num = []
  656. import phonenumbers
  657. # print(verticaltext)
  658. numbers = phonenumbers.PhoneNumberMatcher(
  659. verticaltext.replace('+91', '').replace('(0)', '').replace('(', '').replace(')', ''), "IN")
  660. for number in numbers:
  661. number = str(number).split(")")
  662. num.append(number[1])
  663. # num.append(number[-1])
  664. if len(num) == 0:
  665. final.append("ContactNumber--")
  666. final.append("OrganizationNumber--")
  667. elif len(num) > 1:
  668. final.append("ContactNumber--" + num[0].replace(' ', ''))
  669. final.append("OrganizationNumber--" + num[-1].replace(' ', ''))
  670. elif len(num) == 1:
  671. try:
  672. final.append("ContactNumber--" + num[0].replace(' ', ''))
  673. final.append("OrganizationNumber--")
  674. except IndexError:
  675. final.append("ContactNumber--")
  676. final.append("OrganizationNumber--")
  677. print(
  678. '############################################################# num #############################################################')
  679. print(num)
  680. # try:
  681. # final.append("PhoneNumber--" + num[0].replace(' ', ''))
  682. # remove_list.append(num[0])
  683. # except IndexError:
  684. # pass
  685. # try:
  686. # final.append("PhoneNumber1--" + num[1].replace(' ', ''))
  687. # remove_list.append(num[1])
  688. # except IndexError:
  689. # pass
  690. # try:
  691. # final.append("PhoneNumber2--" + num[2].replace(' ', ''))
  692. # remove_list.append(num[2])
  693. # except IndexError:
  694. # pass
  695. ################################################### Email######################################################
  696. import re
  697. from email_scraper import scrape_emails
  698. s = list(scrape_emails(horizontaltext))
  699. email_id = s
  700. # email_id = []
  701. # matches = re.findall(r'[\w\.-]+@[\w\.-]+', verticaltext)
  702. # for match in matches:
  703. # email_id.append(match)
  704. # # final.append('Email--' + match)
  705. # email_ = str(email_id).replace("[", "").replace("]", "").replace("'", "")
  706. # # final.append(email_)
  707. # # final.append('Email--' + email_)
  708. # # remove_list.append(email_)
  709. if len(email_id) > 1:
  710. final.append(
  711. 'ContactEmail--' + str(email_id[0]).replace("[", "").replace("]", "").replace("\\n", "").replace("'",
  712. ""))
  713. final.append(
  714. 'OrganizationEmail--' + str(email_id[-1]).replace("[", "").replace("]", "").replace("\\n", "").replace(
  715. "'",
  716. ""))
  717. else:
  718. try:
  719. final.append(
  720. 'ContactEmail--' + str(email_id[0]).replace("[", "").replace("]", "").replace("\\n", "").replace(
  721. "'",
  722. ""))
  723. final.append('OrganizationEmail--')
  724. except IndexError:
  725. final.append('ContactEmail--')
  726. final.append('OrganizationEmail--')
  727. ###############PINCODE############
  728. pinlst = []
  729. print(addrespinlst)
  730. import pgeocode
  731. # try:
  732. # matche1 = re.findall(r'-\d{6}|\b\d{6}\b|\b\d{3} \d{3}\b|-\d{3} \d{3}|\b\d{3} \d{3}\b', addrespinlst[0])
  733. # for i in matche1:
  734. # address3 = i.replace(' ', '').replace('-', '')
  735. # pinlst.append(address3)
  736. # except IndexError:
  737. lst = []
  738. for i in num:
  739. i = i[1:]
  740. lst.append(i)
  741. infile = r"vtext.txt"
  742. outfile = r"cleaned_file.txt"
  743. import glob
  744. delete_list = lst
  745. # delete_list = ["firstname1 lastname1","firstname2 lastname2","firstnamen lastnamen",'Director - Sales & Business Development']
  746. fin = open(infile, "r+")
  747. fout = open(outfile, "w+")
  748. for line12 in fin:
  749. for word in delete_list:
  750. line12 = line12.replace(word, "")
  751. fout.write(line12)
  752. fin.close()
  753. # print(line)
  754. # print(addrespinlst)
  755. import pgeocode
  756. print(line12)
  757. import re
  758. matche1 = re.findall(r'-\d{6}|\b\d{6}\b|\b\d{3} \d{3}\b|-\d{3} \d{3}|\b\d{3} \d{3}\b', line12)
  759. for i in matche1:
  760. address3 = i.replace(' ', '').replace('-', '')
  761. pinlst.append(address3)
  762. nomi = pgeocode.Nominatim('IN')
  763. try:
  764. a = nomi.query_postal_code(str(pinlst[-1]))
  765. # print(a)
  766. b = a.keys()
  767. c = b.values.tolist()
  768. d = a.tolist()
  769. postal_code = "PinCode1" + "--" + d[0]
  770. final.append(postal_code)
  771. country_code = c[1] + "--" + str(d[1])
  772. final.append(country_code)
  773. place_name = 'LandMark1' + "--" + str(d[2])
  774. final.append(place_name)
  775. state_name = c[3] + "--" + str(d[3])
  776. final.append(state_name)
  777. state_code = c[4] + "--" + str(d[4])
  778. final.append(state_code)
  779. county_name = 'CityName1' + "--" + str(d[5])
  780. final.append(county_name)
  781. except (IndexError, NameError):
  782. final.append("PinCode1--")
  783. final.append("country_code--")
  784. final.append("LandMark1--")
  785. final.append("state_name--")
  786. final.append("state_code--")
  787. final.append("CityName1--")
  788. ######################################################## json #####################################################################
  789. import pandas as pd
  790. df = pd.DataFrame(final)
  791. df1 = df[0].str.split('--', expand=True)
  792. # print(df1)
  793. df1.rename({df1.columns[-2]: 'Keys'}, axis=1, inplace=True)
  794. df1.rename({df1.columns[-1]: 'Values'}, axis=1, inplace=True)
  795. df1['Keys'] = df1['Keys'].str.strip()
  796. df1.to_csv('path123.csv', index=False)
  797. df2 = pd.read_csv('path123.csv')
  798. print(df2)
  799. df2 = df2.T
  800. df2.to_csv('path1.csv', index=False, header=False)
  801. df1 = pd.read_csv('path1.csv')
  802. df1.to_json('firstjson1.json', orient="index")
  803. import json
  804. with open('firstjson1.json', 'r') as json_file:
  805. json_load = json.load(json_file)
  806. # # url = "https://test.bizgaze.app:8443/apis/v4/bizgaze/integrations/businesscards/create"
  807. nothing = json.dumps(json_load).replace("]", "").replace("[", "").replace('{"0":', '').replace('}}', '}')
  808. # # print('--------------------------------------------------------------------------')
  809. # # print(nothing)
  810. empty = []
  811. import base64
  812. name = found
  813. image = open(name, 'rb')
  814. image_read = image.read()
  815. image_64_encode = base64.b64encode(image_read)
  816. NULL = 'null'
  817. empty.append("ByteData--" + (NULL).strip('""'))
  818. image_64_encode = image_64_encode.decode('utf-8')
  819. empty.append("FileData--" + str(image_64_encode))
  820. imagedata = name.split("/")
  821. imagename = str(imagedata[-1]).replace('"', '').replace("[", "").replace("]", "")
  822. imagename1 = str(imagename).split('.')
  823. imagename = str(imagename1[-2]).replace("[", "]")
  824. empty.append("FileName--" + imagename)
  825. empty.append("FilePath--"+ "")
  826. imageExtension = str(imagename1[-1]).replace("[", "]")
  827. empty.append("FileType--" + imageExtension)
  828. image.close()
  829. import pandas as pd
  830. df = pd.DataFrame(empty)
  831. df = df[0].str.split("--", expand=True)
  832. data1 = pd.DataFrame(df[0])
  833. data2 = pd.DataFrame(df[1])
  834. dt = data2.set_index(data1[0])
  835. dt4 = dt.T
  836. dictionary = dt4.to_dict(orient="index")
  837. list1 = []
  838. # list.append(a)
  839. list1.append(dictionary[1])
  840. # # final.append("image--"+str(dictionary[1]).replace("\'",'"'))
  841. print('--------------------')
  842. # print(namelist)
  843. import json
  844. # JSON data:
  845. x = nothing
  846. # python object to be appended
  847. y = {"image": dictionary[1]}
  848. # parsing JSON string:
  849. z = json.loads(x)
  850. # appending the data
  851. z.update(y)
  852. # the result is a JSON string:
  853. # print(json.dumps(z))
  854. zlist=[]
  855. zlist.append(z)
  856. #############################################creating csv#####################################
  857. print(final)
  858. print(imagelist)
  859. final.append('image--' + str(imagelist))
  860. import requests
  861. import json
  862. url = "https://anwi.bizgaze.app/apis/v4/bizgaze/integrations/businesscards/create/list" #dev
  863. # url = "https://qa.bizgaze.app/apis/v4/bizgaze/integrations/businesscards/create" #testing
  864. # url = "https://test.bizgaze.app/apis/v4/bizgaze/integrations/businesscards/create" # test
  865. # url='http://localhost:3088/apis/v4/bizgaze/integrations/businesscards/create'
  866. # url = 'https://c01.bizgaze.app/apis/v4/bizgaze/integrations/businesscards/create' # C01
  867. payload1 = json.dumps(zlist)
  868. # print('--------------------------------------------------------------------------')
  869. #print(payload1)
  870. headers = {
  871. #'Authorization': 'stat 1a936137490040c997928f485e3cdd7a', #dev
  872. 'Authorization': 'stat 16516391d0074f4c8a15ea16fb49470b',#testing
  873. # 'Authorization': 'stat 08e55fcfbaa940c8ab8145a074c444d1',
  874. # 'Authorization': 'stat f7cdb402e01e44e5842878653946168f', # c01
  875. # 'Authorization': 'Stat c3e11b2fcbfe455b86a1fe6efde02a69',#demo
  876. 'Content-Type': 'application/json'
  877. }
  878. response = requests.request("POST", url, headers=headers, data=payload1)
  879. # print("##############################################################")
  880. #print(payload1)
  881. print(response.text)
  882. import os
  883. if 'BusinessCards Created Successfully' in response.text:
  884. print('present')
  885. os.remove(found)
  886. else:
  887. print('not present')
  888. df1.to_json('visitingcard.json')
  889. data = df1.to_json('visiting.json', orient='records')
  890. print(data)
  891. #return render_template('index.html')
  892. return response.text
  893. # return 'done'
  894. if __name__ == "__main__":
  895. app.run(host='0.0.0.0', port=1112)