"""Flask service that OCRs uploaded PDFs and appends the extracted records to CSV.

The single endpoint, ``/pdftotext``, receives base64-encoded PDFs as JSON,
rasterises every page, locates rectangular regions on each page with OpenCV,
runs PaddleOCR on each region and writes one CSV row per region that parses
successfully.
"""
import base64
import csv
import glob
import os
import re

import cv2
import numpy as np
import pandas as pd
from flask import Flask, render_template, request, redirect, Response, send_file
from paddleocr import PaddleOCR, draw_ocr
from pdf2image import convert_from_path
from PyPDF2 import PdfReader

app = Flask(__name__)

# Working directories, relative to the process working directory.
UPLOAD_DIR = './uploads/'
IMAGE_DIR = './images/'

# Column order of the generated CSV files.
CSV_COLUMNS = [
    'Name', 'Father Name', 'Mother Name', 'Husband Name', 'Wife Name',
    'Others Name', 'House Number', 'Age', 'Gender', 'Voter Number',
]

# Bounding-box area limits (in pixels) for a contour to be OCR'd as a record.
MIN_BOX_AREA = 9000
MAX_BOX_AREA = 100000

_TWO_DIGITS = re.compile(r"\d{2}")


def _safe_name(name):
    """Return *name* stripped of any directory components.

    The name comes from the request body, so it must not be able to point
    outside the directory it is joined to.
    """
    return os.path.basename(name.replace('\\', '/'))


def _ocr_lines(ocr_result):
    """Return the recognised text strings from a PaddleOCR result.

    An empty result, or one whose first entry is ``None`` (no detections --
    see the PaddleOCR documentation), yields an empty list instead of raising.
    """
    if not ocr_result or ocr_result[0] is None:
        return []
    return [line[1][0] for line in ocr_result[0]]


def _latest_street(lines, current):
    """Return the text after the last ':' of the last line containing 'Section'.

    Falls back to *current* when no line mentions 'Section'.
    """
    for text in lines:
        if 'Section' in text:
            current = text.split(':')[-1]
    return current


def _candidate_boxes(img):
    """Return ``(x, y, w, h)`` rectangles to OCR, ordered top-to-bottom, left-to-right.

    Rectangles are the bounding boxes of contours found in a binarised copy
    of *img* whose area lies within ``[MIN_BOX_AREA, MAX_BOX_AREA]``.
    """
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    # NOTE(review): max value 300 exceeds the 8-bit range; left unchanged.
    _, thresh = cv2.threshold(gray, 250, 300, cv2.THRESH_BINARY)
    contours, _ = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
    rects = sorted(
        (cv2.boundingRect(contour) for contour in contours),
        key=lambda rect: (rect[1], rect[0]),
    )
    return [rect for rect in rects if MIN_BOX_AREA <= rect[2] * rect[3] <= MAX_BOX_AREA]


def _parse_voter_box(lines, street):
    """Turn the OCR text lines of one box into a CSV row dict.

    Args:
        lines: recognised text strings of the box, in OCR order.
        street: text appended (after a comma) to the house number.

    Returns:
        A dict keyed by a subset of ``CSV_COLUMNS``.

    Raises:
        ValueError: if the box lacks the 'Photo is' / 'Available' marker lines.
        IndexError: if the box has fewer than two remaining lines, or no age
            or voter number could be found.
    """
    lines = list(lines)
    # Only boxes carrying both marker lines are treated as records.
    lines.remove('Photo is')
    lines.remove('Available')

    ages = [
        _TWO_DIGITS.findall(text)[0]
        for text in lines
        if "Age" in text and _TWO_DIGITS.findall(text)
    ]
    # A voter number is a token mixing letters and digits.
    voter_ids = [
        text for text in lines
        if text.isalnum() and not text.isalpha() and not text.isdigit()
    ]

    labelled = [
        text.lower()
        .replace('name', 'Name:')
        .replace('number', 'Number:')
        .replace("gender", "',Gender:'")
        for text in lines
    ]
    # Mark the first two lines as labelled so they are never merged backwards.
    labelled[0] = ":" + labelled[0]
    labelled[1] = ":" + labelled[1]
    # Fold lines without a ':' into the preceding line (continuation text).
    for idx in range(1, len(labelled)):
        if ":" not in labelled[idx]:
            labelled[idx - 1] += " " + labelled[idx]
            labelled[idx] = ""

    fields = [field for field in labelled if field != ""]
    fields = [field[1:] if field.startswith(":") else field for field in fields]
    fields = [field.replace('::', ':') for field in fields]

    # Built fresh for every box so a failed box cannot leak into the next one.
    record = {"Age": ages[0], "Voter Number": voter_ids[0]}
    for field in fields:
        value = field.split(":")[-1].upper()
        if field.startswith("Name"):
            record["Name"] = value
        elif "father" in field:
            record["Father Name"] = value
        elif "husband" in field:
            record["Husband Name"] = value
        elif "mother" in field:
            record["Mother Name"] = value
        elif "Gender" in field:
            record["Gender"] = value
        elif "house" in field:
            record["House Number"] = value + ',' + street
        elif "wife" in field:
            record["Wife Name"] = value
        elif "other" in field:
            record["Others Name"] = value
    return record


def _clear_directory(directory):
    """Delete every entry directly inside *directory*."""
    for path in glob.glob(os.path.join(directory, '*')):
        os.remove(path)


def _extract_document(document, ocr):
    """Decode one uploaded PDF, OCR its pages and append rows to ``<FileName>.csv``.

    Args:
        document: mapping with 'FileData' (base64 text), 'FileName' and
            'FileType' entries.
        ocr: a ``PaddleOCR`` instance.
    """
    file_stem = _safe_name(document['FileName'])
    file_name = _safe_name(document['FileName'] + '.' + document['FileType'])
    print(file_name)

    pdf_path = UPLOAD_DIR + file_name
    with open(pdf_path, "wb") as fh:
        fh.write(base64.decodebytes(document['FileData'].encode()))

    # Rasterise every page to ./images/<page index>.jpg
    for page_no, page in enumerate(convert_from_path(pdf_path)):
        page.save(IMAGE_DIR + str(page_no) + '.jpg', 'JPEG')

    page_count = len(PdfReader(pdf_path).pages)
    print(page_count)

    csv_path = file_stem + '.csv'
    # The file is opened for append; only emit the header when it is new or
    # empty, otherwise every request would add another header row.
    needs_header = not os.path.exists(csv_path) or os.path.getsize(csv_path) == 0
    with open(csv_path, 'a', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=CSV_COLUMNS)
        if needs_header:
            writer.writeheader()

        # Defined up front: a house line may appear before any 'Section' line.
        street = ''
        for page_no in range(page_count):
            path = IMAGE_DIR + str(page_no) + '.jpg'
            print(path)
            img = cv2.imread(path)
            if img is None:
                # Page image missing or unreadable; nothing to OCR.
                continue
            if img.shape[0] > 1000:
                img = cv2.resize(img, (int(img.shape[1] * 0.7), int(img.shape[0] * 0.7)))

            street = _latest_street(_ocr_lines(ocr.ocr(img)), street)

            for bx, by, bw, bh in _candidate_boxes(img):
                # NOTE: the outline is drawn onto the page before cropping, so
                # it is part of the pixels handed to OCR; kept as-is so the
                # recognition input does not change.
                cv2.rectangle(img, (bx, by), (bx + bw, by + bh), (0, 255, 0), 2)
                box = img[by:by + bh, bx:bx + bw]
                lines = _ocr_lines(ocr.ocr(box))
                try:
                    record = _parse_voter_box(lines, street)
                except (IndexError, ValueError):
                    # Not a parsable record box; skip it.
                    continue
                print(record)
                writer.writerow(record)
                csvfile.flush()


@app.route('/pdftotext', methods=["POST"])
def pdftotext():
    """Handle ``POST /pdftotext``.

    Expects a JSON array of objects with 'FileData' (base64-encoded PDF),
    'FileName' and 'FileType'. For each entry the extracted rows are appended
    to ``<FileName>.csv`` in the working directory. The upload and image
    scratch directories are emptied afterwards, even when processing fails.

    Returns:
        The string ``'done'``.
    """
    documents = request.get_json() or []
    ocr = None
    try:
        for document in documents:
            if ocr is None:
                # Build the model once per request instead of once per file.
                ocr = PaddleOCR(use_gpu=True, use_angle_cls=True, lang='en',
                                use_space_char=True, show_log=False,
                                type='structure', image_enhance=True)
            _extract_document(document, ocr)
    finally:
        # Clean the directories this handler actually writes to.
        _clear_directory(UPLOAD_DIR)
        _clear_directory(IMAGE_DIR)
    return 'done'


if __name__ == "__main__":
    app.run(host='0.0.0.0', port=1112)