暂无描述
您最多选择25个主题 主题必须以字母或数字开头,可以包含连字符 (-),并且长度不得超过35个字符

app.py 8.4KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200
import base64
import csv
import glob
import os
import re

import cv2
import numpy as np
import pandas as pd
from flask import Flask, render_template, request, redirect, Response, send_file
from paddleocr import PaddleOCR, draw_ocr
from pdf2image import convert_from_path
from PyPDF2 import PdfReader
  12. #ocr = PaddleOCR(use_angle_cls=True, lang='en', use_space_char=True, show_log=False,type='structure',image_enhance=True)
  13. app = Flask(__name__)
  14. @app.route('/pdftotext', methods=["POST"])
  15. def pdftotext():
  16. Dataset = request.get_json()
  17. data = {'visiting': Dataset}
  18. for i in data['visiting']:
  19. import time
  20. #time.sleep(1)
  21. a = i
  22. x = a['FileData']
  23. # print(x)
  24. y = a['FileName']
  25. z = a['FileType']
  26. # CreatedBy=a['CreatedBy']
  27. name = y + '.' + z
  28. print(name)
  29. # print(y)
  30. # image = y.split("/")
  31. # filename=image[-1]
  32. # print(x)
  33. img_data = x.encode()
  34. import base64
  35. with open('./uploads/' + name, "wb") as fh:
  36. fh.write(base64.decodebytes(img_data))
  37. # pdf to images
  38. images = convert_from_path('./uploads/' + name)
  39. for i in range(len(images)):
  40. images[i].save('./images/'+ str(i) +'.jpg', 'JPEG')
  41. #count of a pdf files
  42. reader = PdfReader('./uploads/' + name)
  43. num = len(reader.pages)
  44. print(num)
  45. import pandas as pd
  46. ocr = PaddleOCR(use_gpu=True,use_angle_cls=True, lang='en', use_space_char=True, show_log=False,type='structure',image_enhance=True)
  47. dframe = pd.DataFrame({'Name': [], 'Father Name': [], 'Mother Name':[],'Husband Name':[],'Wife Name':[] , 'Others Name':[] ,'House Number': [],'Age': [], 'Gender': [],'Voter Number':[]})
  48. # Assuming 'dframe' is the DataFrame with column names
  49. with open(y+'.csv', 'a', newline='') as csvfile:
  50. writer = csv.DictWriter(csvfile, fieldnames=dframe.columns)
  51. writer.writeheader()
  52. list_of_elements=[]
  53. for i in range(num):
  54. path='./images/'+ str(i)+'.jpg'
  55. print(path)
  56. img = cv2.imread(path)
  57. if img.shape[0] > 1000:
  58. img = cv2.resize(img, (int(img.shape[1]*0.7), int(img.shape[0]*0.7)))
  59. result1 = ocr.ocr(img)
  60. txts1 = [line[1][0] for line in result1[0]]
  61. #print(txts1)
  62. for i in txts1:
  63. if 'Section' in i:
  64. street = (i.split(':')[-1])
  65. gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
  66. ret, thresh = cv2.threshold(gray, 250, 300, cv2.THRESH_BINARY)
  67. contours, hierarchy = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
  68. import pandas as pd
  69. min_size = 9000
  70. max_size = 100000
  71. # Sort contours based on y-coordinate and then x-coordinate
  72. contours = sorted(contours, key=lambda contour: (cv2.boundingRect(contour)[1], cv2.boundingRect(contour)[0]))
  73. # Extract text from each contour
  74. for contour in contours:
  75. x, y, w, h = cv2.boundingRect(contour)
  76. box_size = w * h
  77. if box_size >= min_size and box_size <= max_size:
  78. # Draw bounding box around contour
  79. cv2.rectangle(img, (x, y), (x + w, y + h), (0, 255, 0), 2)
  80. # for contour in contours:
  81. # x, y, w, h = cv2.boundingRect(contour)
  82. # box_size = w * h
  83. # if box_size >= min_size and box_size <= max_size:
  84. # cv2.rectangle(img, (x, y), (x+w, y+h), (0, 255, 0), 1)
  85. # cv2.putText(img, "Box", (x, y-10), cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 255, 0), 2)
  86. box = img[y:y + h, x:x + w]
  87. #cv2_imshow(box)
  88. result = ocr.ocr(box)
  89. try:
  90. txts = [line[1][0] for line in result[0]]
  91. txts.remove('Photo is')
  92. txts.remove('Available')
  93. age = [re.findall(r"\d{2}", i)[0] for i in txts if "Age" in i and re.findall(r"\d{2}", i)]
  94. voter_ID = [element for element in txts if
  95. element.isalnum() and not element.isalpha() and not element.isdigit()]
  96. all_data = [item.lower().replace('name', 'Name:').replace('number', 'Number:').replace("gender", "',Gender:'")
  97. for item in txts]
  98. all_data[0] = ":" + all_data[0]
  99. all_data[1] = ":" + all_data[1]
  100. for i in range(1, len(all_data)):
  101. if ":" not in all_data[i]:
  102. all_data[i - 1] += " " + all_data[i]
  103. all_data[i] = ""
  104. # Remove empty elements from the list
  105. my_list = [element for element in all_data if element != ""]
  106. # my_list = [all_data[i-1] + " " + all_data[i] if ":" not in all_data[i] else all_data[i] for i in range(0, len(all_data)) if all_data[i] != ""]
  107. my_list = [my_list[i][1:] if my_list[i].startswith(":") else my_list[i] for i in range(len(my_list))]
  108. my_list = [s.replace('::', ':') for s in my_list]
  109. list_of_elements.append("Age: " + age[0])
  110. list_of_elements.append("Voter Number: " + voter_ID[0])
  111. for i in my_list:
  112. if i.startswith("Name"):
  113. person_name = i.split(":")[-1].upper()
  114. list_of_elements.append("Name: " + person_name)
  115. elif "father" in i:
  116. fathername = i.split(":")[-1].upper()
  117. list_of_elements.append("Father Name: " + fathername)
  118. elif "husband" in i:
  119. husband_name = i.split(":")[-1].upper()
  120. list_of_elements.append("Husband Name: " + husband_name)
  121. elif "mother" in i:
  122. mother_name = i.split(":")[-1].upper()
  123. list_of_elements.append("Mother Name: " + mother_name)
  124. elif "Gender" in i:
  125. gender = i.split(":")[-1].upper()
  126. list_of_elements.append("Gender: " + gender)
  127. elif "house" in i:
  128. house = i.split(":")[-1].upper()
  129. list_of_elements.append("House Number: " + house + ',' + street)
  130. elif "wife" in i:
  131. wife = i.split(":")[-1].upper()
  132. list_of_elements.append("Wife Name: " + wife)
  133. elif "other" in i:
  134. other = i.split(":")[-1].upper()
  135. list_of_elements.append("Others Name: " + other)
  136. df = pd.DataFrame(list_of_elements)
  137. #df=df.T
  138. data = dict([val.split(': ') for val in df[0]])
  139. list_of_elements.clear()
  140. print(data)
  141. writer.writerow(data)
  142. csvfile.flush()
  143. except (IndexError,ValueError):
  144. pass
  145. files = glob.glob('D:\\text_extraction\\uploads\\*')
  146. for f in files:
  147. os.remove(f)
  148. files = glob.glob('D:\\text_extraction\\images\\*')
  149. for f in files:
  150. os.remove(f)
  151. return 'done'
  152. if __name__ == "__main__":
  153. app.run(host='0.0.0.0',port=1112)