2 年之前 · f0d96c99ba
--- a/Electrol/app.py
+++ b/Electrol/app.py
@@ -0,0 +1,200 @@
 
				
import base64
import csv
import glob
import os
import re

import cv2
import numpy as np
import pandas as pd
from flask import Flask, render_template, request, redirect, Response, send_file
from paddleocr import PaddleOCR, draw_ocr
from pdf2image import convert_from_path
from PyPDF2 import PdfReader

			
 
				
# Disabled module-level OCR engine, kept for reference only; the request
# handler below constructs its own PaddleOCR instance.
#ocr = PaddleOCR(use_angle_cls=True, lang='en', use_space_char=True, show_log=False,type='structure',image_enhance=True)

# Flask application object; the routes in this module are registered on it
# and it is started from the ``__main__`` guard at the bottom of the file.
app = Flask(__name__)

			
 
				
				+

			
 
				
				+@app.route('/pdftotext', methods=["POST"])

			
 
				
				+def pdftotext():

			
 
				
				+

			
 
				
				+    Dataset = request.get_json()

			
 
				
				+

			
 
				
				+    data = {'visiting': Dataset}

			
 
				
				+    for i in data['visiting']:

			
 
				
				+        import time

			
 
				
				+        #time.sleep(1)

			
 
				
				+        a = i

			
 
				
				+        x = a['FileData']

			
 
				
				+        # print(x)

			
 
				
				+        y = a['FileName']

			
 
				
				+        z = a['FileType']

			
 
				
				+        # CreatedBy=a['CreatedBy']

			
 
				
				+

			
 
				
				+        name = y + '.' + z

			
 
				
				+        print(name)

			
 
				
				+        # print(y)

			
 
				
				+        # image = y.split("/")

			
 
				
				+        # filename=image[-1]

			
 
				
				+

			
 
				
				+        # print(x)

			
 
				
				+        img_data = x.encode()

			
 
				
				+

			
 
				
				+        import base64

			
 
				
				+        with open('./uploads/' + name, "wb") as fh:

			
 
				
				+            fh.write(base64.decodebytes(img_data))

			
 
				
				+

			
 
				
				+

			
 
				
				+

			
 
				
				+

			
 
				
				+       # pdf to images

			
 
				
				+        images = convert_from_path('./uploads/' + name)

			
 
				
				+        

			
 
				
				+        for i in range(len(images)):

			
 
				
				+            images[i].save('./images/'+ str(i) +'.jpg', 'JPEG')

			
 
				
				+        

			
 
				
				+        

			
 
				
				+        #count of a pdf files

			
 
				
				+        reader = PdfReader('./uploads/' + name)

			
 
				
				+        num = len(reader.pages)

			
 
				
				+        print(num)

			
 
				
				+        

			
 
				
				+        

			
 
				
				+        import pandas as pd

			
 
				
				+        ocr = PaddleOCR(use_gpu=True,use_angle_cls=True, lang='en', use_space_char=True, show_log=False,type='structure',image_enhance=True)

			
 
				
				+        dframe = pd.DataFrame({'Name': [], 'Father Name': [], 'Mother Name':[],'Husband Name':[],'Wife Name':[] , 'Others Name':[] ,'House Number': [],'Age': [], 'Gender': [],'Voter Number':[]})

			
 
				
				+        

			
 
				
				+        

			
 
				
				+        

			
 
				
				+        # Assuming 'dframe' is the DataFrame with column names

			
 
				
				+        with open(y+'.csv', 'a', newline='') as csvfile:

			
 
				
				+            writer = csv.DictWriter(csvfile, fieldnames=dframe.columns)

			
 
				
				+            writer.writeheader()

			
 
				
				+        

			
 
				
				+            list_of_elements=[]

			
 
				
				+        

			
 
				
				+            for i in range(num):

			
 
				
				+              path='./images/'+ str(i)+'.jpg'

			
 
				
				+              print(path)

			
 
				
				+        

			
 
				
				+              img = cv2.imread(path)

			
 
				
				+              if img.shape[0] > 1000:

			
 
				
				+                img = cv2.resize(img, (int(img.shape[1]*0.7), int(img.shape[0]*0.7)))  

			
 
				
				+              result1 = ocr.ocr(img)

			
 
				
				+              txts1 = [line[1][0] for line in result1[0]]

			
 
				
				+              #print(txts1)

			
 
				
				+        

			
 
				
				+              for i in txts1:

			
 
				
				+                  if 'Section' in i:

			
 
				
				+                      street = (i.split(':')[-1])

			
 
				
				+        

			
 
				
				+        

			
 
				
				+        

			
 
				
				+              gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

			
 
				
				+              ret, thresh = cv2.threshold(gray, 250, 300, cv2.THRESH_BINARY)

			
 
				
				+              contours, hierarchy = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)

			
 
				
				+              import pandas as pd

			
 
				
				+              min_size = 9000

			
 
				
				+              max_size = 100000

			
 
				
				+        

			
 
				
				+              # Sort contours based on y-coordinate and then x-coordinate

			
 
				
				+              contours = sorted(contours, key=lambda contour: (cv2.boundingRect(contour)[1], cv2.boundingRect(contour)[0]))

			
 
				
				+        

			
 
				
				+              # Extract text from each contour

			
 
				
				+              for contour in contours:

			
 
				
				+                  x, y, w, h = cv2.boundingRect(contour)

			
 
				
				+                  box_size = w * h

			
 
				
				+                  if box_size >= min_size and box_size <= max_size:

			
 
				
				+                      # Draw bounding box around contour

			
 
				
				+                      cv2.rectangle(img, (x, y), (x + w, y + h), (0, 255, 0), 2)

			
 
				
				+        

			
 
				
				+                      # for contour in contours:

			
 
				
				+        

			
 
				
				+                      #     x, y, w, h = cv2.boundingRect(contour)

			
 
				
				+                      #     box_size = w * h

			
 
				
				+                      #     if box_size >= min_size and box_size <= max_size:

			
 
				
				+                      #       cv2.rectangle(img, (x, y), (x+w, y+h), (0, 255, 0), 1)

			
 
				
				+                      # cv2.putText(img, "Box", (x, y-10), cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 255, 0), 2)

			
 
				
				+                      box = img[y:y + h, x:x + w]

			
 
				
				+                      #cv2_imshow(box)

			
 
				
				+                      result = ocr.ocr(box)

			
 
				
				+                      try:

			
 
				
				+                          txts = [line[1][0] for line in result[0]]

			
 
				
				+        

			
 
				
				+                          txts.remove('Photo is')

			
 
				
				+                          txts.remove('Available')

			
 
				
				+        

			
 
				
				+                          age = [re.findall(r"\d{2}", i)[0] for i in txts if "Age" in i and re.findall(r"\d{2}", i)]

			
 
				
				+                          voter_ID = [element for element in txts if

			
 
				
				+                                      element.isalnum() and not element.isalpha() and not element.isdigit()]

			
 
				
				+                          all_data = [item.lower().replace('name', 'Name:').replace('number', 'Number:').replace("gender", "',Gender:'")

			
 
				
				+                                  for item in txts]

			
 
				
				+        

			
 
				
				+                          all_data[0] = ":" + all_data[0]

			
 
				
				+                          all_data[1] = ":" + all_data[1]

			
 
				
				+                          for i in range(1, len(all_data)):

			
 
				
				+                              if ":" not in all_data[i]:

			
 
				
				+                                  all_data[i - 1] += " " + all_data[i]

			
 
				
				+                                  all_data[i] = ""

			
 
				
				+        

			
 
				
				+                              # Remove empty elements from the list

			
 
				
				+                          my_list = [element for element in all_data if element != ""]

			
 
				
				+                          # my_list = [all_data[i-1] + " " + all_data[i] if ":" not in all_data[i] else all_data[i] for i in range(0, len(all_data)) if all_data[i] != ""]

			
 
				
				+                          my_list = [my_list[i][1:] if my_list[i].startswith(":") else my_list[i] for i in range(len(my_list))]

			
 
				
				+                          my_list = [s.replace('::', ':') for s in my_list]

			
 
				
				+        

			
 
				
				+                          list_of_elements.append("Age: " + age[0])

			
 
				
				+                          list_of_elements.append("Voter Number: " + voter_ID[0])

			
 
				
				+        

			
 
				
				+                          for i in my_list:

			
 
				
				+                              if i.startswith("Name"):

			
 
				
				+                                  person_name = i.split(":")[-1].upper()

			
 
				
				+                                  list_of_elements.append("Name: " + person_name)

			
 
				
				+                              elif "father" in i:

			
 
				
				+                                  fathername = i.split(":")[-1].upper()

			
 
				
				+                                  list_of_elements.append("Father Name: " + fathername)

			
 
				
				+                              elif "husband" in i:

			
 
				
				+                                  husband_name = i.split(":")[-1].upper()

			
 
				
				+                                  list_of_elements.append("Husband Name: " + husband_name)

			
 
				
				+                              elif "mother" in i:

			
 
				
				+                                  mother_name = i.split(":")[-1].upper()

			
 
				
				+                                  list_of_elements.append("Mother Name: " + mother_name)

			
 
				
				+                              elif "Gender" in i:

			
 
				
				+                                  gender = i.split(":")[-1].upper()

			
 
				
				+                                  list_of_elements.append("Gender: " + gender)

			
 
				
				+                              elif "house" in i:

			
 
				
				+                                  house = i.split(":")[-1].upper()

			
 
				
				+                                  list_of_elements.append("House Number: " + house + ',' + street)

			
 
				
				+                              elif "wife" in i:

			
 
				
				+                                  wife = i.split(":")[-1].upper()

			
 
				
				+                                  list_of_elements.append("Wife Name: " + wife)

			
 
				
				+                              elif "other" in i:

			
 
				
				+                                  other = i.split(":")[-1].upper()

			
 
				
				+                                  list_of_elements.append("Others Name: " + other)

			
 
				
				+        

			
 
				
				+                          df = pd.DataFrame(list_of_elements)

			
 
				
				+                          #df=df.T

			
 
				
				+                          data = dict([val.split(': ') for val in df[0]])

			
 
				
				+                          list_of_elements.clear()

			
 
				
				+                          print(data)

			
 
				
				+                          writer.writerow(data)

			
 
				
				+                          csvfile.flush()

			
 
				
				+                      except (IndexError,ValueError):

			
 
				
				+                        pass

			
 
				
				+    

			
 
				
				+

			
 
				
				+        files = glob.glob('D:\\text_extraction\\uploads\\*')

			
 
				
				+        for f in files:

			
 
				
				+          os.remove(f)

			
 
				
				+

			
 
				
				+        files = glob.glob('D:\\text_extraction\\images\\*')

			
 
				
				+        for f in files:

			
 
				
				+          os.remove(f)

			
 
				
				+

			
 
				
				+

			
 
				
				+

			
 
				
				+    return 'done'   

			
 
				
				+        

			
 
				
				+

			
 
				
				+

			
 
				
				+if __name__ == "__main__":

			
 
				
				+    app.run(host='0.0.0.0',port=1112)