No Description
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

Business_cards.py 81KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207
  1. from flask import Flask, render_template, request, redirect, Response, send_file
  2. import os
  3. import requests
  4. import pandas as pd
  5. import pgeocode
  6. from email_scraper import scrape_emails
  7. import phonenumbers
  8. from pdfminer.high_level import extract_text
  9. import pytesseract
  10. import time
  11. import multiprocessing
  12. from PIL import Image
  13. from functools import partial
  14. from urlextract import URLExtract
  15. import pytesseract as tess
  16. from PIL import Image
  17. # from doctr.io import DocumentFile
  18. # from doctr.models import ocr_predictor
  19. # model = ocr_predictor(pretrained=True)
  20. # load tagger
  21. # import spacy
  22. # nlp_model1 = spacy.load('./ADD300_new3.0')
  23. from flair.data import Sentence
  24. from flair.models import SequenceTagger
  25. from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
  26. tokenizer = AutoTokenizer.from_pretrained("ml6team/bert-base-uncased-city-country-ner")
  27. model = AutoModelForTokenClassification.from_pretrained("ml6team/bert-base-uncased-city-country-ner")
  28. from paddleocr import PaddleOCR, draw_ocr
  29. ocr = PaddleOCR(use_angle_cls=True, lang='en', use_space_char=True, show_log=True)
  30. tagger = SequenceTagger.load("flair/ner-english-large")
  31. import datetime
  32. app = Flask(__name__)
  33. # app.config["IMAGE_UPLOADS"] = "C:/inetpub/wwwroot/FlaskApplication/Flask_Demo/upload/"
  34. @app.route('/', methods=['GET'])
  35. def resume():
  36. return render_template('index.html')
  37. #@app.route('/upload_BusinessCards', methods=["POST"])
  38. def predict(Dataset):
  39. print('################## single card detection #######################')
  40. starttime = datetime.datetime.now()
  41. print('Execution Started at:', starttime)
  42. # print(Dataset)
  43. import os
  44. # if request.method == "POST":
  45. # if request.files:
  46. # image = request.files["image"]
  47. # try:
  48. # image.save(os.path.join(
  49. # app.config["IMAGE_UPLOADS"], image.filename))
  50. # except IsADirectoryError:
  51. # return render_template('card.html')
  52. # # image.save(os.path.join(
  53. # # app.config["IMAGE_UPLOADS"], image.filename))
  54. # print("Image saved")
  55. # return redirect(request.url)
  56. #url_list = request.get_json()
  57. # print(Dataset)
  58. # print(url_list)
  59. #dataset = request.get_json()
  60. # print(data)
  61. # data = {'visiting': Dataset}
  62. a=Dataset[0]
  63. #a = url_list
  64. # print(a)
  65. x = a['FileData']
  66. # print(x)
  67. y = a['FileName']
  68. z = a['FileType']
  69. # CreatedBy=a['CreatedBy']
  70. name = y + '.' + z
  71. # print(name)
  72. # print(y)
  73. # image = y.split("/")
  74. # filename=image[-1]
  75. # print(x)
  76. img_data = x.encode()
  77. import base64
  78. with open('./upload/' + name, "wb") as fh:
  79. fh.write(base64.decodebytes(img_data))
  80. import re
  81. import csv
  82. import glob
  83. import os
  84. # import pytesseract
  85. # import cv2
  86. import numpy as np
  87. import glob
  88. import os
  89. import cv2
  90. import requests
  91. final = []
  92. # final.append('assignto--'+CreatedBy)
  93. imagelist = []
  94. # print(found)
  95. remove_list = []
  96. import os
  97. import glob
  98. import pdfminer
  99. # import os
  100. # ts = 0
  101. # for file_name in glob.glob('./upload/*'):
  102. # fts = os.path.getmtime(file_name)
  103. # if fts > ts:
  104. # ts = fts
  105. # found = file_name
  106. found = './upload/' + name
  107. print(found)
  108. extension = found.split('.')[-1]
  109. # print(extension)
  110. def org_name():
# Fallback NER pass used when the primary flair run finds nothing:
# re-OCR the uploaded file with Tesseract and re-tag the text with flair.
  111. print('org_name is working')
  112. import pytesseract
  113. fname = found
# Non-PDF uploads: greyscale the image, re-save it at 1200 dpi, then let
# Tesseract emit a searchable PDF ("demo.pdf") whose text pdfminer extracts.
  114. if extension != 'pdf':
  115. img = cv2.imread(fname)
  116. img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
  117. cv2.imwrite(str(found), img)
  118. from PIL import Image
  119. im = Image.open(found)
  120. im.save("images1.png", dpi=(1200, 1200))
  121. # import pytesseract
  122. fname = "images1.png"
  123. import pytesseract as tess
  124. from PIL import Image
# NOTE(review): hard-coded Windows Tesseract path — breaks on other hosts.
  125. tess.pytesseract.tesseract_cmd=r"C:\Program Files (x86)\Tesseract-OCR\tesseract.exe"
  126. pdf = tess.image_to_pdf_or_hocr(fname, extension="pdf")
  127. with open("demo.pdf","w+b",) as f:
  128. f.write(pdf)
  129. from pdfminer.high_level import extract_text
  130. text = extract_text('demo.pdf')
  131. # doc = DocumentFile.from_images(found)
  132. # result = model(doc)
  133. # text = result.render()
  134. # from pdfminer.high_level import extract_text
  135. # txt = extract_text('demo.pdf')
# PDF uploads are read directly with pdfminer.
  136. else:
  137. from pdfminer.high_level import extract_text
  138. text = extract_text(fname)
# Tag the text with flair, then split the Sentence repr on the "→" arrow
# to get at the "token:TAG" list flair prints after the raw text.
  139. sentence = Sentence(text)
  140. # predict NER tags
  141. tagger.predict(sentence)
  142. # print sentence
  143. ko = (sentence)
  144. ko1 = str(ko).split("→")
  145. import pandas as pd
  146. dfg = []
  147. try:
# NOTE(review): .replace("", "") is a no-op — the originals probably
# stripped special characters that were lost in this copy; confirm.
  148. s = ko1[1].replace("", "").replace("", "").replace("/", ":")
  149. # os.remove(found)
  150. # return 'Invalid image'
  151. dfg.append(s)
# Round-trip through df.csv to split the tag list into one row per token.
  152. df = pd.DataFrame(dfg)
  153. df = df[0]
  154. df.to_csv("df.csv", index=False)
  155. df1 = pd.read_csv("df.csv")
  156. ve = df1["0"].str.split(",")
  157. fgf = ve.to_list()
  158. dfgh = pd.DataFrame(fgf[0])
  159. maindf = dfgh[0] # .str.split(":")
  160. # maindf.to_csv("main.csv")
  161. main1 = maindf.to_list()
  162. main1
  163. # cv=pd.DataFrame(ve)
  164. # cv
# Bucket the tagged tokens by entity type.
# NOTE(review): these look like locals of org_name, yet organisation() /
# contactpersonname() are called immediately after org_name() and read
# names of the same spelling from the enclosing scope — confirm which
# scope the author intended.
  165. per = ["PER"]
  166. org = ["ORG"]
  167. loc = ["LOC"]
  168. organizations = [i for i in main1 for j in org if j in i]
  169. PErsons = [i for i in main1 for j in per if j in i]
  170. location = [i for i in main1 for j in loc if j in i]
# If flair produced no "→" section there is nothing to parse.
  171. except IndexError:
  172. pass
  173. # ************************************* ORGANIZATION ********************************************************************
  174. def organisation():
# Append an "OrganizationName--..." entry to `final` by fuzzy-matching the
# flair ORG entities against the first detected website URL (urlfinal[0]),
# falling back through progressively weaker strategies and finally to the
# keyword scan in company().
  175. print('organisation working ')
# Branch 1: two ORG entities available — join them as "first /second".
  176. try:
  177. if len(("OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace('[',
  178. '').replace(
  179. '.com', '').replace('.in', '') + " /" + organizations[1].replace(":ORG", "").replace('"',
  180. '').replace(
  181. '.com', ''))) < 4:
  182. pass
  183. else:
# Strip scheme/WWW/TLD noise from the URL so it can be compared with the
# OCR'd organisation text.
  184. match = str(urlfinal[0]).lower()
  185. match = match.replace('.com', '').replace('www.', '').replace('.in', '').replace('.co', '').replace(
  186. 'https',
  187. '').replace(
  188. 'http', '').replace(":", "").replace("/", "").upper()
  189. print(match)
  190. s1g = organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace('.com',
  191. '') + " /" + \
  192. organizations[1].replace(":ORG", "").replace('"', '').replace('.com', '')
  193. s1 = s1g.upper()
  194. s2 = match.upper()
  195. from difflib import SequenceMatcher
  196. print(s1)
  197. print(s2)
  198. print(SequenceMatcher(None, s1, s2).ratio())
# Similarity >= 0.10 → trust the OCR'd ORG text; otherwise fall back to
# the cleaned URL stem as the organisation name.
  199. if SequenceMatcher(None, s1, s2).ratio() >= 0.10:
  200. # and SequenceMatcher(None, s1, s2).ratio()<0.50:
  201. final.append(
  202. "OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace('[',
  203. '').replace(
  204. '.com', '').replace(']', '') + " /" + organizations[1].replace(":ORG", "").replace('"',
  205. '').replace(
  206. '.com',
  207. '').replace(']', ''))
  208. else:
  209. final.append("OrganizationName--" + s2)
# Branch 2: only one ORG entity detected.
  210. except IndexError:
  211. try:
  212. if len(("OrganizationName--" + organizations[0].replace(":ORG", "").replace('[', '').replace(']',
  213. '').replace(
  214. '"',
  215. '').replace(
  216. '.com', '').replace('.in', ''))) < 4:
  217. pass
  218. else:
  219. match = str(urlfinal[0]).lower()
  220. match = match.replace('.com', '').replace('www.', '').replace('.in', '').replace('.co', '').replace(
  221. 'https', '').replace('http', '').replace(":", "").replace("/", "").upper()
  222. s1g = organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace('.com', '')
  223. s1 = s1g.upper()
  224. s2 = match.upper()
  225. from difflib import SequenceMatcher
  226. print(s1)
  227. print(s2)
  228. print(SequenceMatcher(None, s1, s2).ratio())
  229. if SequenceMatcher(None, s1, s2).ratio() >= 0.10:
  230. # and SequenceMatcher(None, s1, s2).ratio()<0.50:
  231. final.append(
  232. "OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace('[',
  233. '').replace(
  234. ']', '').replace(
  235. '.com', ''))
  236. else:
  237. final.append("OrganizationName--" + s2)
# Branch 3: no ORG entity at all — derive the name from the URL alone,
# and if there is no URL either, run the keyword scan.
  238. except IndexError:
  239. try:
  240. match = str(urlfinal[0]).lower()
  241. match = match.replace('.com', '').replace('www.', '').replace('.in', '').replace('.co', '').upper()
  242. final.append("OrganizationName--" + match)
  243. # remove_list.append(match)
  244. except IndexError:
  245. company()
  246. #################################################company Name########################################
  247. def company():
  248. print('company list working')
  249. import re
  250. new = []
  251. with open('test.txt', 'r+') as f:
  252. flag = False
  253. for line in f:
  254. line = line.upper()
  255. matches = re.findall(
  256. r'''\bENTERPRISE\b|\bTRADE\b|\bEMPIRE\b|\bSTORES\b|\bMACHINERY\b|\bINDUSTRIES\b|\bTECHNOLOGY\b|\bCOMPANY\b|\bDESIGNERS\b|\bPOLYMER\b|\bBELT\b|\bAGRO\b|\bPLASTIC\b|\bGROUP\b|\bTOOLS\b|\bENGG.\b|\bSOLUTION\b|\bCONSTRUCTION\b|\bPACK\b|\bELECT\b|\bSTEEL\b|\bIRON\b|\bDIES\b|\bMOULD\b|\bCORPORATION\b|\bSEEDS\b|\bPOWER\b|\bCONSULTANT\b|\bMFG.\b|\bPRINT\b|\bFOOD\b|\bSOLAR\b|\bINDUSTRY\b|\bLIMITED\b|\bPRIVATE\b|\bPVT\b|\bLTD\b|\bOUTSOURCING\b|\bCNC\b|\bMACHINERIES\b|\bSOLUTIONS\b|\bENGINEERS\b|\bWORKS\b|\bPRODUCTS\b|\bENTERPRISES\b|\bCOMPANIES\b|\bPOLYMERS\b|\bTRADING\b''',
  257. line)
  258. for i in matches:
  259. if i in line:
  260. flag = True
  261. if flag:
  262. o = "OrganizationName--" + line
  263. new.append(o)
  264. # if line.startswith('\n'):
  265. # flag = False
  266. try:
  267. a = new[0].replace('\n', '')
  268. final.append(a)
  269. except IndexError:
  270. final.append("OrganizationName--")
  271. # ************************************* CONTACT PERSON *******************************************************************
  272. def contactpersonname():
  273. print('contactpersonname working')
  274. try:
  275. final.append(
  276. "CONTACTPERSONNAME--" + PErsons[0].replace(":PER", "").replace("[", "").replace('"', '').replace("]",
  277. "") + '/' +
  278. PErsons[
  279. 1].replace(":PER", "").replace('"', ''))
  280. except IndexError:
  281. try:
  282. final.append(
  283. "CONTACTPERSONNAME--" + PErsons[0].replace(":PER", "").replace("[", "").replace("]", "").replace(
  284. '"', ''))
  285. except IndexError:
  286. final.append("CONTACTPERSONNAME--")
  287. def image_to_text():
  288. # doc = DocumentFile.from_images(found)
  289. # result = model(doc)
  290. # image_to_text.txt = result.render()
  291. # tess.pytesseract.tesseract_cmd = r"C:\\Program Files (x86)\\Tesseract-OCR\\tesseract.exe"
  292. # img = Image.open(found)
  293. # text = tess.image_to_string(img)
  294. # image_to_text.txt = text
  295. # print(text)
  296. import cv2
  297. img_path = found
  298. img = cv2.imread(img_path)
  299. img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
  300. cv2.imwrite(str(found), img)
  301. result = ocr.ocr(img_path, cls=True)
  302. result = result[0]
  303. txts = [line[1][0] for line in result]
  304. image_to_text.txt = ""
  305. for i in txts:
  306. if len(i) < 4:
  307. continue
  308. # print(i+"\n")
  309. image_to_text.txt = image_to_text.txt + str(i) + "\n"
  310. # print(image_to_text.txt)
  311. def pdf_to_text():
  312. from pdfminer.high_level import extract_text
  313. pdf_to_text.txt = extract_text(found)
  314. # pdf_to_text.txt= text.replace('\n', ' ')
  315. extensionlist = ['JPEG', 'jpg', 'png', 'JPG', 'PNG', 'jpeg']
  316. if extension in extensionlist:
  317. print('image' + extension)
  318. image_to_text()
  319. x = image_to_text.txt
  320. else:
  321. print('pdf' + extension)
  322. pdf_to_text()
  323. x = pdf_to_text.txt
  324. verticaltext = x
  325. htext = x
  326. # print('------------------------------------------------')
  327. print(
  328. '############################################################# this is verticaltext #################################################################')
  329. print(verticaltext)
  330. htext = htext.replace('\n', ' ')
  331. print(
  332. '############################################################# this is htext #############################################################')
  333. print(htext)
  334. y = x.replace('\n', ',')
  335. y = y.replace(' ', ' ')
  336. # y = y.replace(".", " .")
  337. horizontaltext = y
  338. # print('------------------------------------------------')
  339. print(
  340. '############################################################# this is horizontaltext #############################################################')
  341. print(horizontaltext)
  342. textfile = open("test123456.txt", "w")
  343. a = textfile.write(verticaltext)
  344. textfile.close()
  345. textfile = open("vtext.txt", "w")
  346. a = textfile.write(horizontaltext)
  347. textfile.close()
  348. with open('test123456.txt', 'r') as f:
  349. with open('test.txt', 'w') as w:
  350. for line in f:
  351. if line.strip().replace('|', ''):
  352. w.write(line)
  353. ###########################ADDRESS##################################
  354. addrespinlst = []
  355. def splitaddress():
# Heuristic address extraction from the flattened OCR text: take the last
# word before the first comma (address1), the text leading up to a
# PIN-code-like number (address2), and the PIN itself (address3), then
# join the three and append "ADDRESS--..." to `final`.
  356. import re
  357. textaddress = htext.replace('\n', ' ')
  358. # print(textaddress)
  359. address1 = (textaddress.partition(",")[0])
  360. words = address1.split()
  361. address1 = words[-1]
  362. addre = (htext.partition(",")[2])
  363. a = addre.replace('\n', ' ').replace('\x0c', '')
  364. addre = (a.partition(",")[2])
# Patterns for Indian-style PIN codes ("-600 001", "600001", "-12", ...).
  365. matches = re.findall(
  366. r'(.*?)-\d{3} \d{3}|(.*?)\b-\d{6}\b|(.*?)\b\d{6}\b|(.*?)\b\d{3} \d{3}\b|\b(.*?)-\d{2}\b|(.*?)\b\d{3} \d{3}\b',
  367. a)
# The loop overwrites on every iteration, so address2 keeps only the LAST
# match, stringified and stripped of tuple punctuation and spaces.
  368. for match in matches:
  369. address2 = match
  370. address2 = str(address2)
  371. address2 = address2.replace("'", "").replace("(", "").replace(")", "").replace(', ,', '').replace(' ', '')
  372. matches = re.findall(r'-\d{6}|\b\d{6}\b|\b\d{3} \d{3}\b|\b-\d{2}\b|-\d{3} \d{3}|\b\d{3} \d{3}\b', a)
# address3 likewise ends up as the last PIN-style match; when the regex
# never matches, address2/address3 stay unbound and the NameError branch
# below is the fallback path.
  373. for address3 in matches:
  374. pass
  375. try:
  376. Address = address1 + "," + address2 + "," + address3
  377. final.append('ADDRESS--' + Address)
  378. addrespinlst.append(Address)
  379. except NameError:
  380. print(
  381. '############################################################ Addressmodelworking #############################################################')
# The spaCy-based address model that used to run here is disabled; the
# function currently just logs and returns nothing in this branch.
  382. # doc = nlp_model1(textaddress)
  383. # addlist = []
  384. # for ent in doc.ents:
  385. # name = (f'{ent.label_.upper():{10}}--{ent.text}')
  386. # addlist.append(name)
  387. # try:
  388. # Address = addlist[0]
  389. # final.append(Address)
  390. # addrespinlst.append(Address)
  391. # remove_list.append(
  392. # str(Address).replace("[", "").replace("]", "").replace("\\n", "").replace("'", "").replace(
  393. # "ADDRESS--",
  394. # ""))
  395. # except IndexError:
  396. # final.append("ADDRESS--")
  397. pass
  398. ################################################## website#######################################################
  399. # import re
  400. # url = []
  401. # matches = re.findall(r'www.*', verticaltext)
  402. # for match in matches:
  403. # if (match.count('.')) == 1:
  404. # a_string1 = match.replace("www", "www.")
  405. # final.append("Urls--" + a_string1)
  406. # url.append(a_string1)
  407. # else:
  408. # final.append("Urls--" + match)
  409. # if len(url)==0:
  410. # from urlextract import URLExtract
  411. # extractor = URLExtract()
  412. # urls = extractor.find_urls(verticaltext)
  413. # try:
  414. # urllist = urls[0]
  415. # final.append("Urls--"+urllist)
  416. # url.append(urllist)
  417. # except IndexError:
  418. # final.append("Urls--")
  419. # for match in matches:
  420. # if (match.count('.')) == 1:
  421. # a_string1 = match.replace("www", "www.")
  422. # final.append("Urls--" + a_string1)
  423. # url.append(a_string1)
  424. # else:
  425. # final.append("Urls--" + match)
  426. # url.append(match)
  427. # remove_list.append(match)
  428. # else:
  429. # final.append("Urls--" )
  430. ################################################## website#######################################################
  431. import re
  432. # final=[]
  433. url = []
  434. urlfinal = []
  435. matches = re.findall(r'www.*', verticaltext)
  436. for match in matches:
  437. if (match.count('.')) == 1:
  438. a_string1 = match.replace("www", "www.")
  439. # final.append("Urls--" + a_string1)
  440. url.append(a_string1)
  441. else:
  442. url.append(match)
  443. if len(url) == 0:
  444. from urlextract import URLExtract
  445. extractor = URLExtract()
  446. urls = extractor.find_urls(verticaltext)
  447. try:
  448. urllist = urls[0]
  449. url.append(urllist)
  450. url.append(urllist)
  451. except IndexError:
  452. pass
  453. for match in matches:
  454. if (match.count('.')) == 1:
  455. a_string1 = match.replace("www", "www.")
  456. url.append(a_string1)
  457. # url.append(a_string1)
  458. else:
  459. url.append(match)
  460. url.append(match)
  461. else:
  462. pass
  463. try:
  464. test_string = url[0]
  465. test_list = ['com', 'www', 'in', 'co', "WWW", "COM", "CO", "IN"]
  466. res = [ele for ele in test_list if (ele in test_string)]
  467. if len(res) == 0:
  468. print('no match')
  469. final.append('urls--')
  470. else:
  471. print('matched')
  472. final.append('urls--' + url[0])
  473. urlfinal.append(url[0])
  474. except IndexError:
  475. final.append('urls--')
  476. print(
  477. '############################################################# url #############################################################')
  478. print(url)
  479. #######organisation and contact################
  480. # def company_url():
  481. # # print('--url--')
  482. # # print(url)
  483. # try:
  484. # match = str(url[0]).lower()
  485. # match =match.replace('.com','').replace('www.','').replace('.in','').replace('.co','').upper()
  486. # final.append("OrganizationName--" + match)
  487. # # remove_list.append(match)
  488. # except IndexError:
  489. # org_name()
  490. # organisation()
  491. # final.append("OrganizationName--")
  492. # make example sentence
  493. # print(horizontaltext)
  494. sentence = Sentence(verticaltext)
  495. # predict NER tags
  496. tagger.predict(sentence)
  497. # print sentence
  498. ko = (sentence)
  499. ko1 = str(ko).split("→")
  500. import pandas as pd
  501. dfg = []
  502. try:
  503. s = ko1[1].replace("", "").replace("", "").replace("/", ":")
  504. except IndexError:
  505. os.remove(found)
  506. return 'Invalid image'
  507. dfg.append(s)
  508. df = pd.DataFrame(dfg)
  509. df = df[0]
  510. df.to_csv("df.csv", index=False)
  511. df1 = pd.read_csv("df.csv")
  512. ve = df1["0"].str.split(",")
  513. fgf = ve.to_list()
  514. dfgh = pd.DataFrame(fgf[0])
  515. maindf = dfgh[0] # .str.split(":")
  516. # maindf.to_csv("main.csv")
  517. main1 = maindf.to_list()
  518. main1
  519. # cv=pd.DataFrame(ve)
  520. # cv
  521. per = ["PER"]
  522. org = ["ORG"]
  523. loc = ["LOC"]
  524. organizations = [i for i in main1 for j in org if j in i]
  525. PErsons = [i for i in main1 for j in per if j in i]
  526. location = [i for i in main1 for j in loc if j in i]
  527. # ************************************* ORGANIZATION ********************************************************************
  528. try:
  529. if len(("OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace(
  530. ']', '').replace(
  531. '.com', '') + " /" + organizations[1].replace(":ORG", "").replace('"', '').replace('.com', ''))) < 4:
  532. pass
  533. # company_url()
  534. else:
  535. match = str(urlfinal[0]).lower()
  536. match = match.replace('.com', '').replace('www.', '').replace('.in', '').replace('.co', '').replace('https',
  537. '').replace(
  538. 'http', '').replace(":", "").replace("/", "").upper()
  539. print(match)
  540. s1g = organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace(']', '').replace(
  541. '.com', '') + " /" + \
  542. organizations[1].replace(":ORG", "").replace('"', '').replace('.com', '')
  543. s1 = s1g.upper()
  544. s2 = match.upper()
  545. from difflib import SequenceMatcher
  546. print(s1)
  547. print(s2)
  548. print(SequenceMatcher(None, s1, s2).ratio())
  549. if SequenceMatcher(None, s1, s2).ratio() >= 0.10:
  550. # and SequenceMatcher(None, s1, s2).ratio()<0.50:
  551. final.append("OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace('[',
  552. '').replace(
  553. '.com', '').replace(']', '') + " /" + organizations[1].replace(":ORG", "").replace('"', '').replace(
  554. '.com', '').replace(']', ''))
  555. else:
  556. final.append("OrganizationName--" + s2)
  557. except IndexError:
  558. try:
  559. if len(("OrganizationName--" + organizations[0].replace(":ORG", "").replace('[', '').replace(']',
  560. '').replace(
  561. '"',
  562. '').replace(
  563. '.com', ''))) < 4:
  564. pass
  565. # company_url()
  566. else:
  567. match = str(urlfinal[0]).lower()
  568. match = match.replace('.com', '').replace('www.', '').replace('.in', '').replace('.co', '').replace(
  569. 'https', '').replace('http', '').replace(":", "").replace("/", "").upper()
  570. s1g = organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace(']', '').replace(
  571. '.com', '')
  572. s1 = s1g.upper()
  573. s2 = match.upper()
  574. from difflib import SequenceMatcher
  575. print(s1)
  576. print(s2)
  577. print(SequenceMatcher(None, s1, s2).ratio())
  578. if SequenceMatcher(None, s1, s2).ratio() >= 0.10:
  579. # and SequenceMatcher(None, s1, s2).ratio()<0.50:
  580. final.append(
  581. "OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace('[',
  582. '').replace(
  583. ']', '').replace(
  584. '.com', '').replace(']', ''))
  585. else:
  586. final.append("OrganizationName--" + s2)
  587. except IndexError:
  588. org_name()
  589. organisation()
  590. # final.append("OrganizationName--")
  591. # ************************************* CONTACT PERSON *******************************************************************
  592. try:
  593. final.append(
  594. "CONTACTPERSONNAME--" + PErsons[0].replace(":PER", "").replace("[", "").replace('"', '').replace("]", "") +
  595. PErsons[
  596. 1].replace(":PER", "").replace('"', ''))
  597. except IndexError:
  598. try:
  599. final.append(
  600. "CONTACTPERSONNAME--" + PErsons[0].replace(":PER", "").replace("[", "").replace("]", "").replace('"',
  601. ''))
  602. except IndexError:
  603. org_name()
  604. contactpersonname()
  605. # final.append("CONTACTPERSONNAME--")
############### address (flair / transformers NER) #####################
# Strategy: find a start keyword (e.g. "address", "plot") in the OCR text and
# an end token (last CITY/COUNTRY entity from the transformers NER pipeline),
# then take the substring between them as the postal address.
try:
    print(
        '############################################################# address new code #############################################################')
    loactionlst = ['address', 'factory', 'd.no', 'h.no', 'h. no', 'plot', 'flat', 'plat']
    loclst = [i for i in loactionlst if i in htext.lower()]
    textaddress = htext
    # '|' is a common OCR artifact for column separators; treat as comma.
    textaddress = textaddress.replace("|", ",")
    textaddress = textaddress.lower()
    # `model`/`tokenizer` are module-level transformers objects — defined
    # outside this chunk (presumably a location-NER checkpoint; verify).
    nlp = pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy="simple")
    grop = nlp(textaddress)
    citycountry = []
    print('########################### city or country name ###########################')
    # Only the LAST entity is inspected; earlier CITY/COUNTRY hits are ignored.
    d = grop[-1]
    if d['entity_group'] == "COUNTRY":
        print(d["word"])
        citycountry.append(d["word"])
    elif d['entity_group'] == "CITY":
        print(d["word"])
        citycountry.append(d["word"])
    try:
        # Prefer an explicit keyword as the address start marker.
        address1 = loclst[0]
    except IndexError:
        # Fallback: last word of the text before the first comma.
        address1 = (textaddress.partition(",")[0])
        words = address1.split()
        address1 = words[-1]
    star_location = address1.lower()
    # '#' is a wordpiece-continuation artifact from the tokenizer output.
    end_location = citycountry[0].replace("#", "")
    start = star_location
    end = end_location
    s = textaddress.lower()
    # Substring between the last occurrence of `start` and the first
    # following occurrence of `end`.
    middle_address = (s.split(start))[-1].split(end)[0]
    Address = start + middle_address + end
    Address = Address.replace('--', '').title()
    print(Address)
    # Heuristic sanity check: a plausible address has at least 2 commas;
    # otherwise fall back to the regex-based splitter.
    if Address.count(',') < 2:
        splitaddress()
    else:
        final.append('ADDRESS--' + Address)
    # (older LOC-entity-span approach removed; it sliced horizontaltext
    # between the first and last LOC words)
except IndexError:
    # citycountry empty / no entities — regex fallback.
    splitaddress()
########################################## Designation ###########################################
# Scan the line-per-entry OCR dump (test.txt) for job-title keywords; the
# first matching line becomes the Designation-- entry.
import re
new = []
with open('test.txt', 'r') as f:
    flag = False  # NOTE(review): set but never used in this loop
    for line in f:
        line1 = line  # keep the original (pre-uppercase) line for remove_list
        line = line.upper()
        matches = re.findall(
            r'''\bAPPRENTICE\b|\bEXECUTIVE\b|\bPROPRIETOR\b|\bPARTNER\b|\bMD\b|\bANALYST\b|\bPRACTITIONER\b|\bCUSTOMER\b|\bCOO\b|\bCOACH\b|\bADMINISTRATIVE\b|\bADMINISTRATOR\b|\bAGENT\b|\bHEAD\b|\bCHIEF\b|\bDIRECTOR\b|\bVICE\b|\bPRESIDENT\b|\bMANAGER\b|\bCOORDINATOR\b|\bCOUNSELOR\b|\bSUPERVISOR\b|\bASSISTANT\b|\bSPECIALIST\b|\bARTIST\b|\bWORKER\b|\bCONSULTANT\b|\bREPRESENTATIVE\b|\bARCHITECT\b|\bSTAFF\b|\bMEMBER\b|\bDEVELOPER\b|\bENGINEER\b|\bEXAMINOR\b|\bDOCTOR\b|\bPROFESSOR\b|\bTEACHER\b|\bLEAD\b|\bOFFICER\b|\bCEO\b|\bC.E.O\b|\bJUNIOR\b|\bSENIOR\b|\bPROFESSOR\b|\bSALES\b''',
            line)
        # One candidate appended per keyword hit on the line (duplicates
        # possible when a line contains several keywords).
        for match in matches:
            line = line.replace('-', '')
            o = "Designation--" + line
            new.append(o)
            remove_list.append(str(line1).replace('\n', ''))
try:
    # Only the first candidate is used.
    a = new[0].replace('\n', '')
    final.append(a)
except IndexError:
    final.append("Designation--")
################################################### Phone number #################################################
# Extract Indian phone numbers from the raw OCR text via the phonenumbers
# matcher; first match -> ContactNumber, last match -> OrganizationNumber.
num = []
import phonenumbers
# Strip country-code noise that confuses the matcher before scanning.
numbers = phonenumbers.PhoneNumberMatcher(
    verticaltext.replace('+91', '').replace('(0)', '').replace('(', '').replace(')', ''), "IN")
for number in numbers:
    # PhoneNumberMatch stringifies as "PhoneNumberMatch [a,b) <digits>";
    # splitting on ")" keeps the raw digits portion.
    number = str(number).split(")")
    num.append(number[1])
if len(num) == 0:
    final.append("ContactNumber--")
    final.append("OrganizationNumber--")
elif len(num) > 1:
    final.append("ContactNumber--" + num[0].replace(' ', ''))
    final.append("OrganizationNumber--" + num[-1].replace(' ', ''))
elif len(num) == 1:
    # NOTE(review): the try/except is dead weight here — num[0] exists when
    # len(num) == 1, so IndexError cannot fire. Preserved as-is.
    try:
        final.append("ContactNumber--" + num[0].replace(' ', ''))
        final.append("OrganizationNumber--")
    except IndexError:
        final.append("ContactNumber--")
        final.append("OrganizationNumber--")
print(
    '############################################################# num #############################################################')
print(num)
# (older per-index PhoneNumber--/PhoneNumber1--/PhoneNumber2-- emission removed)
################################################### Email ######################################################
# Scrape e-mail addresses from the comma-joined OCR text; first hit ->
# ContactEmail, last hit -> OrganizationEmail.
import re
from email_scraper import scrape_emails
s = list(scrape_emails(horizontaltext))
email_id = s
# (older inline regex r'[\w\.-]+@[\w\.-]+' extraction removed)
if len(email_id) > 1:
    final.append(
        'ContactEmail--' + str(email_id[0]).replace("[", "").replace("]", "").replace("\\n", "").replace("'", ""))
    final.append(
        'OrganizationEmail--' + str(email_id[-1]).replace("[", "").replace("]", "").replace("\\n", "").replace("'", ""))
else:
    try:
        # Zero or one address: reuse the single address as ContactEmail and
        # leave OrganizationEmail empty.
        final.append(
            'ContactEmail--' + str(email_id[0]).replace("[", "").replace("]", "").replace("\\n", "").replace("'", ""))
        final.append('OrganizationEmail--')
    except IndexError:
        final.append('ContactEmail--')
        final.append('OrganizationEmail--')
  752. ###############PINCODE############
  753. pinlst = []
  754. print(addrespinlst)
  755. import pgeocode
  756. # try:
  757. # matche1 = re.findall(r'-\d{6}|\b\d{6}\b|\b\d{3} \d{3}\b|-\d{3} \d{3}|\b\d{3} \d{3}\b', addrespinlst[0])
  758. # for i in matche1:
  759. # address3 = i.replace(' ', '').replace('-', '')
  760. # pinlst.append(address3)
  761. # except IndexError:
  762. lst = []
  763. for i in num:
  764. i = i[1:]
  765. lst.append(i)
  766. infile = r"vtext.txt"
  767. outfile = r"cleaned_file.txt"
  768. import glob
  769. delete_list = lst
  770. # delete_list = ["firstname1 lastname1","firstname2 lastname2","firstnamen lastnamen",'Director - Sales & Business Development']
  771. fin = open(infile, "r+")
  772. fout = open(outfile, "w+")
  773. for line12 in fin:
  774. for word in delete_list:
  775. line12 = line12.replace(word, "")
  776. fout.write(line12)
  777. fin.close()
  778. # print(line)
  779. # print(addrespinlst)
  780. import pgeocode
  781. print(line12)
  782. import re
  783. matche1 = re.findall(r'-\d{6}|\b\d{6}\b|\b\d{3} \d{3}\b|-\d{3} \d{3}|\b\d{3} \d{3}\b', line12)
  784. for i in matche1:
  785. address3 = i.replace(' ', '').replace('-', '')
  786. pinlst.append(address3)
  787. nomi = pgeocode.Nominatim('IN')
  788. try:
  789. a = nomi.query_postal_code(str(pinlst[-1]))
  790. # print(a)
  791. b = a.keys()
  792. c = b.values.tolist()
  793. d = a.tolist()
  794. postal_code = "PinCode1" + "--" + d[0]
  795. final.append(postal_code)
  796. country_code = c[1] + "--" + str(d[1])
  797. final.append(country_code)
  798. place_name = 'LandMark1' + "--" + str(d[2])
  799. final.append(place_name)
  800. state_name = c[3] + "--" + str(d[3])
  801. final.append(state_name)
  802. state_code = c[4] + "--" + str(d[4])
  803. final.append(state_code)
  804. county_name = 'CityName1' + "--" + str(d[5])
  805. final.append(county_name)
  806. except (IndexError, NameError):
  807. final.append("PinCode1--")
  808. final.append("country_code--")
  809. final.append("LandMark1--")
  810. final.append("state_name--")
  811. final.append("state_code--")
  812. final.append("CityName1--")
######################################################## json #####################################################################
# Turn the "Key--Value" strings in `final` into a flat JSON object string
# (`nothing`) by round-tripping through two CSV files and a transpose.
import pandas as pd
df = pd.DataFrame(final)
df1 = df[0].str.split('--', expand=True)
# Values containing '--' produce extra columns; the last two columns are
# always the key and (final segment of the) value.
df1.rename({df1.columns[-2]: 'Keys'}, axis=1, inplace=True)
df1.rename({df1.columns[-1]: 'Values'}, axis=1, inplace=True)
df1['Keys']=df1['Keys'].str.strip()
df1.to_csv('path12.csv', index=False)
df2 = pd.read_csv('path12.csv')
print(final)
print(df2)
# Transpose so keys become the header row, then re-read to get one record.
df2 = df2.T
df2.to_csv('path.csv', index=False, header=False)
df1 = pd.read_csv('path.csv')
df1.to_json('firstjson.json', orient="index")
import json
with open('firstjson.json', 'r') as json_file:
    json_load = json.load(json_file)
# Strip the pandas orient="index" wrapper ('{"0": {...}}') down to the inner
# object. NOTE(review): these blanket replaces also strip any '[' / ']'
# occurring inside values — fragile, confirm acceptable.
nothing = json.dumps(json_load).replace("]", "").replace("[", "").replace('{"0":', '').replace('}}', '}')
# print('--------------------------------------------------------------------------')
# print(nothing)
# ---- Attach the original card image (base64) to the JSON payload ----
empty = []
import base64
name = found
image = open(name, 'rb')
image_read = image.read()
image_64_encode = base64.b64encode(image_read)
NULL = 'null'
# ByteData is intentionally the literal string 'null'.
empty.append("ByteData--" + (NULL).strip('""'))
image_64_encode = image_64_encode.decode('utf-8')
empty.append("FileData--" + str(image_64_encode))
# Split "<dir>/<base>.<ext>" into FileName / FileType.
imagedata = name.split("/")
imagename = str(imagedata[-1]).replace('"', '').replace("[", "").replace("]", "")
imagename1 = str(imagename).split('.')
imagename = str(imagename1[-2]).replace("[", "]")
empty.append("FileName--" + imagename)
empty.append("FilePath--" + found)
imageExtension = str(imagename1[-1]).replace("[", "]")
empty.append("FileType--" + imageExtension)
image.close()
# Same Key--Value -> dict dance as above, via a transposed DataFrame.
import pandas as pd
df = pd.DataFrame(empty)
df = df[0].str.split("--", expand=True)
data1 = pd.DataFrame(df[0])
data2 = pd.DataFrame(df[1])
dt = data2.set_index(data1[0])
dt4 = dt.T
dictionary = dt4.to_dict(orient="index")
list1 = []
list1.append(dictionary[1])
print('--------------------')
import json
# Merge the card fields (`nothing`) with the image payload under "image".
x = nothing
y = {"image": dictionary[1]}
z = json.loads(x)
z.update(y)
# the result dict `z` is serialized just before the POST below
# ---- POST the assembled payload to the BizGaze businesscards API ----
# url = "https://demo.bizgaze.app/apis/v4/bizgaze/integrations/businesscards/create"
url = "https://test.bizgaze.app/apis/v4/bizgaze/integrations/businesscards/create"
# url = 'https://c01.bizgaze.app/apis/v4/bizgaze/integrations/businesscards/create' # C01
payload1 = json.dumps(z)
headers = {
    # NOTE(review): hard-coded API credential checked into source — should be
    # moved to configuration / environment.
    'Authorization': 'stat db226c95fae04943aa3e3c03a4381b2a',
    # 'Authorization': 'stat f7cdb402e01e44e5842878653946168f', # c01
    # 'Authorization': 'Stat c3e11b2fcbfe455b86a1fe6efde02a69',#demosss
    'Content-Type': 'application/json'
}
response = requests.request("POST", url, headers=headers, data=payload1)
print("##############################################################")
print(response.text)
# Success is detected by substring match on the response body; only then is
# the uploaded source image deleted.
if 'BusinessCards Created Successfully' in response.text:
    print('present')
    os.remove(found)
else:
    print('not present')
endtime = datetime.datetime.now()
print('Completed at:', endtime)
print(starttime)
print(endtime)
print('--------------------------')
# Return the raw API response body to the caller.
return response.text
  928. #@app.route('/upload_BusinessCards', methods=["POST"])
  929. #@app.route('/multiplecards', methods=["POST"])
  930. def multiplecards(Dataset):
  931. print('################## multiple card detection #######################')
  932. #print(Dataset)
  933. #dataset = request.get_json()
  934. # print(data)
  935. data = {'visiting': Dataset}
  936. for i in data['visiting']:
  937. import time
  938. #time.sleep(1)
  939. a = i
  940. x = a['FileData']
  941. # print(x)
  942. y = a['FileName']
  943. z = a['FileType']
  944. # CreatedBy=a['CreatedBy']
  945. name = y + '.' + z
  946. # print(name)
  947. # print(y)
  948. # image = y.split("/")
  949. # filename=image[-1]
  950. # print(x)
  951. img_data = x.encode()
  952. import base64
  953. with open('./multicards/' + name, "wb") as fh:
  954. fh.write(base64.decodebytes(img_data))
  955. # print(i)
  956. # import os
  957. # import glob
  958. # for i in glob.glob('./multipleupload/*'):
  959. found = './multicards/' + name
  960. print(found)
  961. extension = found.split('.')[-1]
  962. # for root, dirs, fils in os.glob('./multipleupload'):
  963. # for name in files:
  964. # foundfile= os.path.join(root, name)
  965. # print(foundfile)
  966. import re
  967. import csv
  968. import glob
  969. import os
  970. # import pytesseract
  971. # import cv2
  972. import numpy as np
  973. import glob
  974. import os
  975. import cv2
  976. import requests
  977. final = []
  978. # final.append('assignto--'+CreatedBy)
  979. imagelist = []
  980. # print(found)
  981. remove_list = []
  982. import os
  983. import glob
  984. import pdfminer
  985. # import os
  986. # ts = 0
  987. # for file_name in glob.glob('./upload/*'):
  988. # fts = os.path.getmtime(file_name)
  989. # if fts > ts:
  990. # ts = fts
  991. # found = file_name
  992. # print(found)
  993. # print(extension)
def org_name():
    """OCR the card again via Tesseract->PDF->pdfminer and re-run flair NER.

    Side effects: overwrites `found` with a grayscale copy, writes
    images1.png / demo.pdf / df.csv in the working directory.
    NOTE(review): organizations/PErsons/location are assigned as LOCALS here
    and are discarded when the function returns — presumably they were meant
    to rebind the enclosing scope's variables; confirm against the full file.
    """
    print('org_name is working')
    import pytesseract
    fname = found
    if extension != 'pdf':
        # Grayscale the image, upscale DPI, and let Tesseract emit a
        # searchable PDF so pdfminer can extract clean text from it.
        img = cv2.imread(fname)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        cv2.imwrite(str(found), img)
        from PIL import Image
        im = Image.open(found)
        im.save("images1.png", dpi=(1200, 1200))
        fname = "images1.png"
        import pytesseract as tess
        from PIL import Image
        # Windows-specific Tesseract install path.
        tess.pytesseract.tesseract_cmd=r"C:\Program Files (x86)\Tesseract-OCR\tesseract.exe"
        pdf = tess.image_to_pdf_or_hocr(fname, extension="pdf")
        with open("demo.pdf","w+b",) as f:
            f.write(pdf)
        from pdfminer.high_level import extract_text
        text = extract_text('demo.pdf')
    else:
        # Input was already a PDF: extract its text directly.
        from pdfminer.high_level import extract_text
        text = extract_text(fname)
    sentence = Sentence(text)
    # predict NER tags with the module-level flair tagger
    tagger.predict(sentence)
    ko = (sentence)
    # flair renders entities after a '→' separator in str(sentence).
    ko1 = str(ko).split("→")
    import pandas as pd
    dfg = []
    try:
        # NOTE(review): the two .replace("", "") calls are no-ops — special
        # bracket characters were likely lost in a copy/paste; verify.
        s = ko1[1].replace("", "").replace("", "").replace("/", ":")
        dfg.append(s)
        # Round-trip through df.csv to split the entity list on commas.
        df = pd.DataFrame(dfg)
        df = df[0]
        df.to_csv("df.csv", index=False)
        df1 = pd.read_csv("df.csv")
        ve = df1["0"].str.split(",")
        fgf = ve.to_list()
        dfgh = pd.DataFrame(fgf[0])
        maindf = dfgh[0]
        main1 = maindf.to_list()
        main1  # no-op expression (kept from original)
        # Bucket entities by tag suffix.
        per = ["PER"]
        org = ["ORG"]
        loc = ["LOC"]
        organizations = [i for i in main1 for j in org if j in i]
        PErsons = [i for i in main1 for j in per if j in i]
        location = [i for i in main1 for j in loc if j in i]
    except IndexError:
        # No '→' section in the render (no entities found) — leave state as-is.
        pass
  1057. # ************************************* ORGANIZATION ********************************************************************
def organisation():
    """Fallback organisation-name resolver.

    Compares the first two ORG entities against the detected website domain
    (`urlfinal[0]`) with difflib; appends "OrganizationName--..." to `final`.
    Falls back in order: two ORGs -> one ORG -> bare domain -> company().
    """
    print('organisation working ')
    try:
        # NOTE(review): the "OrganizationName--" prefix alone is 18 chars, so
        # this len(...) < 4 guard can never be true — dead branch, preserved.
        if len(("OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace('[',
                '').replace('.com', '').replace('.in', '') + " /" + organizations[1].replace(":ORG", "").replace('"',
                '').replace('.com', ''))) < 4:
            pass
        else:
            # Normalize the URL to a bare, uppercased domain token.
            # ('https' is stripped before 'http' so both schemes vanish.)
            match = str(urlfinal[0]).lower()
            match = match.replace('.com', '').replace('www.', '').replace('.in', '').replace('.co', '').replace(
                'https', '').replace('http', '').replace(":", "").replace("/", "").upper()
            print(match)
            s1g = organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace('.com', '') + " /" + \
                  organizations[1].replace(":ORG", "").replace('"', '').replace('.com', '')
            s1 = s1g.upper()
            s2 = match.upper()
            from difflib import SequenceMatcher
            print(s1)
            print(s2)
            print(SequenceMatcher(None, s1, s2).ratio())
            # Very loose 10% similarity: prefer the NER names when they
            # resemble the domain at all, else trust the domain.
            if SequenceMatcher(None, s1, s2).ratio() >= 0.10:
                final.append(
                    "OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace('[',
                        '').replace('.com', '').replace(']', '') + " /" + organizations[1].replace(":ORG", "").replace(
                        '"', '').replace('.com', '').replace(']', ''))
            else:
                final.append("OrganizationName--" + s2)
    except IndexError:
        # Fewer than two ORG entities — retry with just the first one.
        try:
            # Same always-false length guard as above (preserved).
            if len(("OrganizationName--" + organizations[0].replace(":ORG", "").replace('[', '').replace(']',
                    '').replace('"', '').replace('.com', '').replace('.in', ''))) < 4:
                pass
            else:
                match = str(urlfinal[0]).lower()
                match = match.replace('.com', '').replace('www.', '').replace('.in', '').replace('.co',
                    '').replace('https', '').replace('http', '').replace(":", "").replace("/", "").upper()
                s1g = organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace('.com', '')
                s1 = s1g.upper()
                s2 = match.upper()
                from difflib import SequenceMatcher
                print(s1)
                print(s2)
                print(SequenceMatcher(None, s1, s2).ratio())
                if SequenceMatcher(None, s1, s2).ratio() >= 0.10:
                    final.append(
                        "OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace(
                            '[', '').replace(']', '').replace('.com', ''))
                else:
                    final.append("OrganizationName--" + s2)
        except IndexError:
            # No ORG entities at all: fall back to the bare domain, then to
            # the keyword scan in company().
            try:
                match = str(urlfinal[0]).lower()
                match = match.replace('.com', '').replace('www.', '').replace('.in', '').replace('.co',
                    '').upper()
                final.append("OrganizationName--" + match)
            except IndexError:
                company()
  1133. #################################################company Name########################################
  1134. def company():
  1135. print('company list working')
  1136. import re
  1137. new = []
  1138. with open('test.txt', 'r+') as f:
  1139. flag = False
  1140. for line in f:
  1141. line = line.upper()
  1142. matches = re.findall(
  1143. r'''\bENTERPRISE\b|\bTRADE\b|\bEMPIRE\b|\bSTORES\b|\bMACHINERY\b|\bINDUSTRIES\b|\bTECHNOLOGY\b|\bCOMPANY\b|\bDESIGNERS\b|\bPOLYMER\b|\bBELT\b|\bAGRO\b|\bPLASTIC\b|\bGROUP\b|\bTOOLS\b|\bENGG.\b|\bSOLUTION\b|\bCONSTRUCTION\b|\bPACK\b|\bELECT\b|\bSTEEL\b|\bIRON\b|\bDIES\b|\bMOULD\b|\bCORPORATION\b|\bSEEDS\b|\bPOWER\b|\bCONSULTANT\b|\bMFG.\b|\bPRINT\b|\bFOOD\b|\bSOLAR\b|\bINDUSTRY\b|\bLIMITED\b|\bPRIVATE\b|\bPVT\b|\bLTD\b|\bOUTSOURCING\b|\bCNC\b|\bMACHINERIES\b|\bSOLUTIONS\b|\bENGINEERS\b|\bWORKS\b|\bPRODUCTS\b|\bENTERPRISES\b|\bCOMPANIES\b|\bPOLYMERS\b|\bTRADING\b''',
  1144. line)
  1145. for i in matches:
  1146. if i in line:
  1147. flag = True
  1148. if flag:
  1149. o = "OrganizationName--" + line
  1150. new.append(o)
  1151. # if line.startswith('\n'):
  1152. # flag = False
  1153. try:
  1154. a = new[0].replace('\n', '')
  1155. final.append(a)
  1156. except IndexError:
  1157. final.append("OrganizationName--")
  1158. # ************************************* CONTACT PERSON *******************************************************************
  1159. def contactpersonname():
  1160. print('contactpersonname working')
  1161. try:
  1162. final.append(
  1163. "CONTACTPERSONNAME--" + PErsons[0].replace(":PER", "").replace("[", "").replace('"', '').replace(
  1164. "]",
  1165. "") + '/' +
  1166. PErsons[
  1167. 1].replace(":PER", "").replace('"', ''))
  1168. except IndexError:
  1169. try:
  1170. final.append(
  1171. "CONTACTPERSONNAME--" + PErsons[0].replace(":PER", "").replace("[", "").replace("]",
  1172. "").replace(
  1173. '"', ''))
  1174. except IndexError:
  1175. final.append("CONTACTPERSONNAME--")
def image_to_text():
    """OCR the card image with PaddleOCR; result in image_to_text.txt.

    Side effect: overwrites `found` on disk with a grayscale copy.
    `ocr` is presumably a module-level PaddleOCR instance — defined outside
    this chunk; verify.
    (Older doctr / pytesseract variants were removed.)
    """
    import cv2
    img_path = found
    img = cv2.imread(img_path)
    img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    cv2.imwrite(str(found), img)
    result = ocr.ocr(img_path, cls=True)
    # PaddleOCR wraps the per-image results in an outer list.
    result = result[0]
    # Each line is [bbox, (text, confidence)] — keep the text only.
    txts = [line[1][0] for line in result]
    # Expose the text as a function attribute (module's convention).
    image_to_text.txt = ""
    for i in txts:
        # Drop very short fragments (< 4 chars) as OCR noise.
        if len(i) < 4:
            continue
        image_to_text.txt = image_to_text.txt + str(i) + "\n"
def pdf_to_text():
    """Extract raw text from the uploaded PDF (`found`) via pdfminer.

    Result is exposed as the function attribute pdf_to_text.txt, mirroring
    image_to_text()'s convention.
    """
    from pdfminer.high_level import extract_text
    pdf_to_text.txt = extract_text(found)
# ---- Dispatch OCR by file type and stage the text into working files ----
extensionlist = ['JPEG', 'jpg', 'png', 'JPG', 'PNG', 'jpeg']
if extension in extensionlist:
    print('image' + extension)
    image_to_text()
    x = image_to_text.txt
else:
    print('pdf' + extension)
    pdf_to_text()
    x = pdf_to_text.txt
# verticaltext keeps line breaks; htext is the same text flattened to one line.
verticaltext = x
htext = x
print(
    '############################################################# this is verticaltext #################################################################')
print(verticaltext)
htext = htext.replace('\n', ' ')
print(
    '############################################################# this is htext #############################################################')
print(htext)
# horizontaltext: newlines become commas (entity-list style).
y = x.replace('\n', ',')
y = y.replace(' ', ' ')
horizontaltext = y
print(
    '############################################################# this is horizontaltext #############################################################')
print(horizontaltext)
# Stage the two views on disk for the downstream sections
# (test123456.txt / vtext.txt / test.txt).
textfile = open("test123456.txt", "w")
a = textfile.write(verticaltext)
textfile.close()
textfile = open("vtext.txt", "w")
a = textfile.write(horizontaltext)
textfile.close()
# test.txt = test123456.txt minus blank/'|'-only lines.
with open('test123456.txt', 'r') as f:
    with open('test.txt', 'w') as w:
        for line in f:
            if line.strip().replace('|', ''):
                w.write(line)
  1242. ###########################ADDRESS##################################
  1243. addrespinlst = []
  1244. def splitaddress():
  1245. import re
  1246. textaddress = htext.replace('\n', ' ')
  1247. # print(textaddress)
  1248. address1 = (textaddress.partition(",")[0])
  1249. words = address1.split()
  1250. address1 = words[-1]
  1251. addre = (htext.partition(",")[2])
  1252. a = addre.replace('\n', ' ').replace('\x0c', '')
  1253. addre = (a.partition(",")[2])
  1254. matches = re.findall(
  1255. r'(.*?)-\d{3} \d{3}|(.*?)\b-\d{6}\b|(.*?)\b\d{6}\b|(.*?)\b\d{3} \d{3}\b|\b(.*?)-\d{2}\b|(.*?)\b\d{3} \d{3}\b',
  1256. a)
  1257. for match in matches:
  1258. address2 = match
  1259. address2 = str(address2)
  1260. address2 = address2.replace("'", "").replace("(", "").replace(")", "").replace(', ,', '').replace(' ',
  1261. '')
  1262. matches = re.findall(r'-\d{6}|\b\d{6}\b|\b\d{3} \d{3}\b|\b-\d{2}\b|-\d{3} \d{3}|\b\d{3} \d{3}\b', a)
  1263. for address3 in matches:
  1264. pass
  1265. try:
  1266. Address = address1 + "," + address2 + "," + address3
  1267. final.append('ADDRESS--' + Address)
  1268. addrespinlst.append(Address)
  1269. except NameError:
  1270. print(
  1271. '############################################################ Addressmodelworking #############################################################')
  1272. # doc = nlp_model1(textaddress)
  1273. # addlist = []
  1274. # for ent in doc.ents:
  1275. # name = (f'{ent.label_.upper():{10}}--{ent.text}')
  1276. # addlist.append(name)
  1277. # try:
  1278. # Address = addlist[0]
  1279. # final.append(Address)
  1280. # addrespinlst.append(Address)
  1281. # remove_list.append(
  1282. # str(Address).replace("[", "").replace("]", "").replace("\\n", "").replace("'", "").replace(
  1283. # "ADDRESS--",
  1284. # ""))
  1285. # except IndexError:
  1286. # final.append("ADDRESS--")
  1287. pass
################################################## website #######################################################
# Detect the card's website: regex on 'www.*' first, then urlextract as a
# fallback; the validated URL feeds `urlfinal` used by the ORG matching.
# (An earlier duplicate of this section that appended directly to `final`
# was removed.)
import re
url = []
urlfinal = []
matches = re.findall(r'www.*', verticaltext)
for match in matches:
    if (match.count('.')) == 1:
        # OCR often drops the dot after 'www' — restore it.
        a_string1 = match.replace("www", "www.")
        url.append(a_string1)
    else:
        url.append(match)
if len(url) == 0:
    from urlextract import URLExtract
    extractor = URLExtract()
    urls = extractor.find_urls(verticaltext)
    try:
        # NOTE(review): appended twice in the original — preserved.
        urllist = urls[0]
        url.append(urllist)
        url.append(urllist)
    except IndexError:
        pass
    # `matches` is empty on this branch, so this loop is effectively dead;
    # preserved from the original.
    for match in matches:
        if (match.count('.')) == 1:
            a_string1 = match.replace("www", "www.")
            url.append(a_string1)
        else:
            url.append(match)
            url.append(match)
else:
    pass
try:
    # Accept the URL only if it contains a recognizable domain token.
    test_string = url[0]
    test_list = ['com', 'www', 'in', 'co', "WWW", "COM", "CO", "IN"]
    res = [ele for ele in test_list if (ele in test_string)]
    if len(res) == 0:
        print('no match')
        final.append('urls--')
    else:
        print('matched')
        final.append('urls--' + url[0])
        urlfinal.append(url[0])
except IndexError:
    final.append('urls--')
print(
    '############################################################# url #############################################################')
print(url)
####### organisation and contact: flair NER over the OCR text ################
# (an older company_url() helper that derived the org name from the URL was
# removed)
sentence = Sentence(verticaltext)
# predict NER tags with the module-level flair tagger
tagger.predict(sentence)
ko = (sentence)
# flair renders entities after a '→' separator in str(sentence).
ko1 = str(ko).split("→")
import pandas as pd
dfg = []
try:
    # NOTE(review): the two .replace("", "") calls are no-ops — special
    # bracket characters were likely lost in a copy/paste; verify.
    s = ko1[1].replace("", "").replace("", "").replace("/", ":")
except IndexError:
    # No entities rendered at all: treat the upload as unusable.
    os.remove(found)
    return 'Invalid image'
dfg.append(s)
# Round-trip through df.csv to split the rendered entity list on commas.
df = pd.DataFrame(dfg)
df = df[0]
df.to_csv("df.csv", index=False)
df1 = pd.read_csv("df.csv")
ve = df1["0"].str.split(",")
fgf = ve.to_list()
dfgh = pd.DataFrame(fgf[0])
maindf = dfgh[0]
main1 = maindf.to_list()
main1  # no-op expression (kept from original)
# Bucket entities by tag suffix for the sections below.
per = ["PER"]
org = ["ORG"]
loc = ["LOC"]
organizations = [i for i in main1 for j in org if j in i]
PErsons = [i for i in main1 for j in per if j in i]
location = [i for i in main1 for j in loc if j in i]
  1417. # ************************************* ORGANIZATION ********************************************************************
  1418. try:
  1419. if len(("OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace('[',
  1420. '').replace(
  1421. ']', '').replace(
  1422. '.com', '') + " /" + organizations[1].replace(":ORG", "").replace('"', '').replace('.com', ''))) < 4:
  1423. pass
  1424. # company_url()
  1425. else:
  1426. match = str(urlfinal[0]).lower()
  1427. match = match.replace('.com', '').replace('www.', '').replace('.in', '').replace('.co', '').replace(
  1428. 'https',
  1429. '').replace(
  1430. 'http', '').replace(":", "").replace("/", "").upper()
  1431. print(match)
  1432. s1g = organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace(']', '').replace(
  1433. '.com', '') + " /" + \
  1434. organizations[1].replace(":ORG", "").replace('"', '').replace('.com', '')
  1435. s1 = s1g.upper()
  1436. s2 = match.upper()
  1437. from difflib import SequenceMatcher
  1438. print(s1)
  1439. print(s2)
  1440. print(SequenceMatcher(None, s1, s2).ratio())
  1441. if SequenceMatcher(None, s1, s2).ratio() >= 0.10:
  1442. # and SequenceMatcher(None, s1, s2).ratio()<0.50:
  1443. final.append(
  1444. "OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace('[',
  1445. '').replace(
  1446. '.com', '').replace(']', '') + " /" + organizations[1].replace(":ORG", "").replace('"',
  1447. '').replace(
  1448. '.com', '').replace(']', ''))
  1449. else:
  1450. final.append("OrganizationName--" + s2)
  1451. except IndexError:
  1452. try:
  1453. if len(("OrganizationName--" + organizations[0].replace(":ORG", "").replace('[', '').replace(']',
  1454. '').replace(
  1455. '"',
  1456. '').replace(
  1457. '.com', ''))) < 4:
  1458. pass
  1459. # company_url()
  1460. else:
  1461. match = str(urlfinal[0]).lower()
  1462. match = match.replace('.com', '').replace('www.', '').replace('.in', '').replace('.co', '').replace(
  1463. 'https', '').replace('http', '').replace(":", "").replace("/", "").upper()
  1464. s1g = organizations[0].replace(":ORG", "").replace('"', '').replace('[', '').replace(']',
  1465. '').replace(
  1466. '.com', '')
  1467. s1 = s1g.upper()
  1468. s2 = match.upper()
  1469. from difflib import SequenceMatcher
  1470. print(s1)
  1471. print(s2)
  1472. print(SequenceMatcher(None, s1, s2).ratio())
  1473. if SequenceMatcher(None, s1, s2).ratio() >= 0.10:
  1474. # and SequenceMatcher(None, s1, s2).ratio()<0.50:
  1475. final.append(
  1476. "OrganizationName--" + organizations[0].replace(":ORG", "").replace('"', '').replace('[',
  1477. '').replace(
  1478. ']', '').replace(
  1479. '.com', '').replace(']', ''))
  1480. else:
  1481. final.append("OrganizationName--" + s2)
  1482. except IndexError:
  1483. org_name()
  1484. organisation()
  1485. # final.append("OrganizationName--")
  1486. # ************************************* CONTACT PERSON *******************************************************************
  1487. try:
  1488. final.append(
  1489. "CONTACTPERSONNAME--" + PErsons[0].replace(":PER", "").replace("[", "").replace('"', '').replace("]",
  1490. "") +
  1491. PErsons[
  1492. 1].replace(":PER", "").replace('"', ''))
  1493. except IndexError:
  1494. try:
  1495. final.append(
  1496. "CONTACTPERSONNAME--" + PErsons[0].replace(":PER", "").replace("[", "").replace("]", "").replace(
  1497. '"',
  1498. ''))
  1499. except IndexError:
  1500. org_name()
  1501. contactpersonname()
  1502. # final.append("CONTACTPERSONNAME--")
  1503. ###############address flair#####################
  1504. try:
  1505. print(
  1506. '############################################################# address new code #############################################################')
  1507. loactionlst = ['address', 'factory', 'd.no', 'h.no', 'h. no', 'plot', 'flat', 'plat']
  1508. loclst = [i for i in loactionlst if i in htext.lower()]
  1509. textaddress = htext
  1510. textaddress = textaddress.replace("|", ",")
  1511. textaddress = textaddress.lower()
  1512. nlp = pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy="simple")
  1513. grop = nlp(textaddress)
  1514. citycountry = []
  1515. print('########################### city or country name ###########################')
  1516. d = grop[-1]
  1517. if d['entity_group'] == "COUNTRY":
  1518. print(d["word"])
  1519. citycountry.append(d["word"])
  1520. elif d['entity_group'] == "CITY":
  1521. print(d["word"])
  1522. citycountry.append(d["word"])
  1523. try:
  1524. address1 = loclst[0]
  1525. except IndexError:
  1526. address1 = (textaddress.partition(",")[0])
  1527. words = address1.split()
  1528. address1 = words[-1]
  1529. star_location = address1.lower()
  1530. end_location = citycountry[0].replace("#", "")
  1531. start = star_location
  1532. end = end_location
  1533. s = textaddress.lower()
  1534. middle_address = (s.split(start))[-1].split(end)[0]
  1535. Address = start + middle_address + end
  1536. Address = Address.replace('--', '').title()
  1537. print(Address)
  1538. if Address.count(',') < 2:
  1539. splitaddress()
  1540. else:
  1541. final.append('ADDRESS--' + Address)
  1542. # star_location = location[0].replace(":LOC", "").replace('"', '').replace('[', '')
  1543. # end_location = location[-1].replace(":LOC", "").replace('"', '').replace(']', '')
  1544. # d1 = star_location.split()
  1545. # d2 = end_location.split()
  1546. # d3 = d1[0]
  1547. # d4 = d2[0]
  1548. # start = d3
  1549. # end = d4
  1550. # s = horizontaltext
  1551. # middle_address = ((s.split(start))[1].split(end)[0])
  1552. # Address = d3 + middle_address + d4
  1553. # final.append('ADDRESS--' + Address)
  1554. # addrespinlst.append(Address)
  1555. except IndexError:
  1556. splitaddress()
  1557. ########################################## Designation ###########################################
  1558. import re
  1559. new = []
  1560. with open('test.txt', 'r') as f:
  1561. flag = False
  1562. for line in f:
  1563. line1 = line
  1564. line = line.upper()
  1565. matches = re.findall(
  1566. r'''\bAPPRENTICE\b|\bEXECUTIVE\b|\bPROPRIETOR\b|\bPARTNER\b|\bMD\b|\bANALYST\b|\bPRACTITIONER\b|\bCUSTOMER\b|\bCOO\b|\bCOACH\b|\bADMINISTRATIVE\b|\bADMINISTRATOR\b|\bAGENT\b|\bHEAD\b|\bCHIEF\b|\bDIRECTOR\b|\bVICE\b|\bPRESIDENT\b|\bMANAGER\b|\bCOORDINATOR\b|\bCOUNSELOR\b|\bSUPERVISOR\b|\bASSISTANT\b|\bSPECIALIST\b|\bARTIST\b|\bWORKER\b|\bCONSULTANT\b|\bREPRESENTATIVE\b|\bARCHITECT\b|\bSTAFF\b|\bMEMBER\b|\bDEVELOPER\b|\bENGINEER\b|\bEXAMINOR\b|\bDOCTOR\b|\bPROFESSOR\b|\bTEACHER\b|\bLEAD\b|\bOFFICER\b|\bCEO\b|\bC.E.O\b|\bJUNIOR\b|\bSENIOR\b|\bPROFESSOR\b|\bSALES\b''',
  1567. line)
  1568. for match in matches:
  1569. line = line.replace('-', '')
  1570. # print(line)
  1571. o = "Designation--" + line
  1572. new.append(o)
  1573. remove_list.append(str(line1).replace('\n', ''))
  1574. try:
  1575. a = new[0].replace('\n', '')
  1576. final.append(a)
  1577. except IndexError:
  1578. final.append("Designation--")
  1579. ###################################################Phone number#################################################
  1580. num = []
  1581. import phonenumbers
  1582. # print(verticaltext)
  1583. numbers = phonenumbers.PhoneNumberMatcher(
  1584. verticaltext.replace('+91', '').replace('(0)', '').replace('(', '').replace(')', ''), "IN")
  1585. for number in numbers:
  1586. number = str(number).split(")")
  1587. num.append(number[1])
  1588. # num.append(number[-1])
  1589. if len(num) == 0:
  1590. final.append("ContactNumber--")
  1591. final.append("OrganizationNumber--")
  1592. elif len(num) > 1:
  1593. final.append("ContactNumber--" + num[0].replace(' ', ''))
  1594. final.append("OrganizationNumber--" + num[-1].replace(' ', ''))
  1595. elif len(num) == 1:
  1596. try:
  1597. final.append("ContactNumber--" + num[0].replace(' ', ''))
  1598. final.append("OrganizationNumber--")
  1599. except IndexError:
  1600. final.append("ContactNumber--")
  1601. final.append("OrganizationNumber--")
  1602. print(
  1603. '############################################################# num #############################################################')
  1604. print(num)
  1605. # try:
  1606. # final.append("PhoneNumber--" + num[0].replace(' ', ''))
  1607. # remove_list.append(num[0])
  1608. # except IndexError:
  1609. # pass
  1610. # try:
  1611. # final.append("PhoneNumber1--" + num[1].replace(' ', ''))
  1612. # remove_list.append(num[1])
  1613. # except IndexError:
  1614. # pass
  1615. # try:
  1616. # final.append("PhoneNumber2--" + num[2].replace(' ', ''))
  1617. # remove_list.append(num[2])
  1618. # except IndexError:
  1619. # pass
  1620. ################################################### Email######################################################
  1621. import re
  1622. from email_scraper import scrape_emails
  1623. s = list(scrape_emails(horizontaltext))
  1624. email_id = s
  1625. # email_id = []
  1626. # matches = re.findall(r'[\w\.-]+@[\w\.-]+', verticaltext)
  1627. # for match in matches:
  1628. # email_id.append(match)
  1629. # # final.append('Email--' + match)
  1630. # email_ = str(email_id).replace("[", "").replace("]", "").replace("'", "")
  1631. # # final.append(email_)
  1632. # # final.append('Email--' + email_)
  1633. # # remove_list.append(email_)
  1634. if len(email_id) > 1:
  1635. final.append(
  1636. 'ContactEmail--' + str(email_id[0]).replace("[", "").replace("]", "").replace("\\n", "").replace("'",
  1637. ""))
  1638. final.append(
  1639. 'OrganizationEmail--' + str(email_id[-1]).replace("[", "").replace("]", "").replace("\\n", "").replace(
  1640. "'",
  1641. ""))
  1642. else:
  1643. try:
  1644. final.append(
  1645. 'ContactEmail--' + str(email_id[0]).replace("[", "").replace("]", "").replace("\\n", "").replace(
  1646. "'",
  1647. ""))
  1648. final.append('OrganizationEmail--')
  1649. except IndexError:
  1650. final.append('ContactEmail--')
  1651. final.append('OrganizationEmail--')
  1652. ###############PINCODE############
  1653. pinlst = []
  1654. print(addrespinlst)
  1655. import pgeocode
  1656. # try:
  1657. # matche1 = re.findall(r'-\d{6}|\b\d{6}\b|\b\d{3} \d{3}\b|-\d{3} \d{3}|\b\d{3} \d{3}\b', addrespinlst[0])
  1658. # for i in matche1:
  1659. # address3 = i.replace(' ', '').replace('-', '')
  1660. # pinlst.append(address3)
  1661. # except IndexError:
  1662. lst = []
  1663. for i in num:
  1664. i = i[1:]
  1665. lst.append(i)
  1666. infile = r"vtext.txt"
  1667. outfile = r"cleaned_file.txt"
  1668. import glob
  1669. delete_list = lst
  1670. # delete_list = ["firstname1 lastname1","firstname2 lastname2","firstnamen lastnamen",'Director - Sales & Business Development']
  1671. fin = open(infile, "r+")
  1672. fout = open(outfile, "w+")
  1673. for line12 in fin:
  1674. for word in delete_list:
  1675. line12 = line12.replace(word, "")
  1676. fout.write(line12)
  1677. fin.close()
  1678. # print(line)
  1679. # print(addrespinlst)
  1680. import pgeocode
  1681. print(line12)
  1682. import re
  1683. matche1 = re.findall(r'-\d{6}|\b\d{6}\b|\b\d{3} \d{3}\b|-\d{3} \d{3}|\b\d{3} \d{3}\b', line12)
  1684. for i in matche1:
  1685. address3 = i.replace(' ', '').replace('-', '')
  1686. pinlst.append(address3)
  1687. nomi = pgeocode.Nominatim('IN')
  1688. try:
  1689. a = nomi.query_postal_code(str(pinlst[-1]))
  1690. # print(a)
  1691. b = a.keys()
  1692. c = b.values.tolist()
  1693. d = a.tolist()
  1694. postal_code = "PinCode1" + "--" + d[0]
  1695. final.append(postal_code)
  1696. country_code = c[1] + "--" + str(d[1])
  1697. final.append(country_code)
  1698. place_name = 'LandMark1' + "--" + str(d[2])
  1699. final.append(place_name)
  1700. state_name = c[3] + "--" + str(d[3])
  1701. final.append(state_name)
  1702. state_code = c[4] + "--" + str(d[4])
  1703. final.append(state_code)
  1704. county_name = 'CityName1' + "--" + str(d[5])
  1705. final.append(county_name)
  1706. except (IndexError, NameError):
  1707. final.append("PinCode1--")
  1708. final.append("country_code--")
  1709. final.append("LandMark1--")
  1710. final.append("state_name--")
  1711. final.append("state_code--")
  1712. final.append("CityName1--")
  1713. ######################################################## json #####################################################################
  1714. import pandas as pd
  1715. df = pd.DataFrame(final)
  1716. df1 = df[0].str.split('--', expand=True)
  1717. # print(df1)
  1718. df1.rename({df1.columns[-2]: 'Keys'}, axis=1, inplace=True)
  1719. df1.rename({df1.columns[-1]: 'Values'}, axis=1, inplace=True)
  1720. df1['Keys']=df1['Keys'].str.strip()
  1721. df1.to_csv('path123.csv', index=False)
  1722. df2 = pd.read_csv('path123.csv')
  1723. print(df2)
  1724. df2 = df2.T
  1725. df2.to_csv('path1.csv', index=False, header=False)
  1726. df1 = pd.read_csv('path1.csv')
  1727. df1.to_json('firstjson1.json', orient="index")
  1728. import json
  1729. with open('firstjson1.json', 'r') as json_file:
  1730. json_load = json.load(json_file)
  1731. # # url = "https://test.bizgaze.app:8443/apis/v4/bizgaze/integrations/businesscards/create"
  1732. nothing = json.dumps(json_load).replace("]", "").replace("[", "").replace('{"0":', '').replace('}}', '}')
  1733. # # print('--------------------------------------------------------------------------')
  1734. # # print(nothing)
  1735. empty = []
  1736. import base64
  1737. name = found
  1738. image = open(name, 'rb')
  1739. image_read = image.read()
  1740. image_64_encode = base64.b64encode(image_read)
  1741. NULL = 'null'
  1742. empty.append("ByteData--" + (NULL).strip('""'))
  1743. image_64_encode = image_64_encode.decode('utf-8')
  1744. empty.append("FileData--" + str(image_64_encode))
  1745. imagedata = name.split("/")
  1746. imagename = str(imagedata[-1]).replace('"', '').replace("[", "").replace("]", "")
  1747. imagename1 = str(imagename).split('.')
  1748. imagename = str(imagename1[-2]).replace("[", "]")
  1749. empty.append("FileName--" + imagename)
  1750. empty.append("FilePath--" + found)
  1751. imageExtension = str(imagename1[-1]).replace("[", "]")
  1752. empty.append("FileType--" + imageExtension)
  1753. image.close()
  1754. import pandas as pd
  1755. df = pd.DataFrame(empty)
  1756. df = df[0].str.split("--", expand=True)
  1757. data1 = pd.DataFrame(df[0])
  1758. data2 = pd.DataFrame(df[1])
  1759. dt = data2.set_index(data1[0])
  1760. dt4 = dt.T
  1761. dictionary = dt4.to_dict(orient="index")
  1762. list1 = []
  1763. # list.append(a)
  1764. list1.append(dictionary[1])
  1765. # # final.append("image--"+str(dictionary[1]).replace("\'",'"'))
  1766. print('--------------------')
  1767. # print(namelist)
  1768. import json
  1769. # JSON data:
  1770. x = nothing
  1771. # python object to be appended
  1772. y = {"image": dictionary[1]}
  1773. # parsing JSON string:
  1774. z = json.loads(x)
  1775. # appending the data
  1776. z.update(y)
  1777. # the result is a JSON string:
  1778. # print(json.dumps(z))
  1779. #############################################creating csv#####################################
  1780. # print(final)
  1781. # print(imagelist)
  1782. # final.append('image--'+str(imagelist))
  1783. # import requests
  1784. # import json
  1785. # # with open('visitingcard1.json', 'r') as json_file:
  1786. # # json_load = json.load(json_file)
  1787. # url = "https://demo.bizgaze.app/apis/v4/bizgaze/integrations/businesscards/create"
  1788. url = "https://test.bizgaze.app/apis/v4/bizgaze/integrations/businesscards/create"
  1789. # url = 'https://c01.bizgaze.app/apis/v4/bizgaze/integrations/businesscards/create' # C01
  1790. payload1 = json.dumps(z)
  1791. # print('--------------------------------------------------------------------------')
  1792. # print(payload1)
  1793. headers = {
  1794. 'Authorization': 'stat db226c95fae04943aa3e3c03a4381b2a',
  1795. # 'Authorization': 'stat f7cdb402e01e44e5842878653946168f', # c01
  1796. # 'Authorization': 'Stat c3e11b2fcbfe455b86a1fe6efde02a69',#demo
  1797. 'Content-Type': 'application/json'
  1798. }
  1799. response = requests.request("POST", url, headers=headers, data=payload1)
  1800. # print("##############################################################")
  1801. # #print(payload1)
  1802. print(response.text)
  1803. import os
  1804. if 'BusinessCards Created Successfully' in response.text:
  1805. print('present')
  1806. os.remove(found)
  1807. else:
  1808. print('not present')
  1809. # df1.to_json('visitingcard.json')
  1810. # data = df1.to_json('visiting.json', orient='records')
  1811. # print(data)
  1812. # return render_template('index.html')
  1813. # files = glob.glob('./upload/*')
  1814. # for f in files:
  1815. # os.remove(f)
  1816. # print('Time Taken:',total)
  1817. return response.text
  1818. # return 'done'
  1819. # # return send_file(p,as_attachment=True)
  1820. # @app.route('/upload_BusinessCards', methods=["POST"])
  1821. # def upload_BusinessCards():
  1822. # if __name__ == "__main__":
  1823. # url_list = []
  1824. # Dataset = request.get_json()
  1825. # print("8888888888888888888888888888888888888888888888888888888888888888888888888888888888")
  1826. # #print(Dataset)
  1827. # # id = "100013660000125"
  1828. # url_list.append(Dataset)
  1829. # # multiprocessing
  1830. # with multiprocessing.Pool(processes=1) as pool:
  1831. # # try:
  1832. # results = pool.map(predict, url_list)
  1833. # # except IndexError:
  1834. # # return 'Invalid image'
  1835. # # results.clear()
  1836. # # a=results[0]
  1837. # pool.close()
  1838. # return results[0]
  1839. @app.route('/upload_BusinessCards', methods=["POST"])
  1840. def mainfunction():
  1841. Dataset = request.get_json()
  1842. if len(Dataset)==1:
  1843. # predict(Dataset)
  1844. return predict(Dataset)
  1845. else:
  1846. # multiplecards(Dataset)
  1847. return multiplecards(Dataset)
  1848. if __name__ == "__main__":
  1849. app.run(host='0.0.0.0',port=1112)