"""Flask service that OCRs electoral-roll PDFs into CSV files (Electrol/app.py).

POST /pdftotext receives a JSON list of objects with the keys ``FileName``,
``FileType`` and ``FileData`` (base64 text).  Each PDF is rendered to page
images, every page is OCR'd, voter-card boxes are located by contour
detection, and one CSV row per recognised card is appended to
``<FileName>.csv`` in the current working directory.
"""
import base64
import csv
import glob
import os
import re

import cv2
from flask import Flask, render_template, request, redirect, Response, send_file
import numpy as np
import pandas as pd
from paddleocr import PaddleOCR, draw_ocr
from pdf2image import convert_from_path
from PyPDF2 import PdfReader

app = Flask(__name__)

# Working directories, relative to the process working directory.
UPLOAD_DIR = './uploads'
IMAGE_DIR = './images'

# CSV column order; rows that lack a column get an empty cell.
CSV_FIELDS = [
    'Name', 'Father Name', 'Mother Name', 'Husband Name', 'Wife Name',
    'Others Name', 'House Number', 'Age', 'Gender', 'Voter Number',
]

# Bounding-box area limits (pixels) for a contour to be OCR'd as a voter card.
MIN_BOX_AREA = 9000
MAX_BOX_AREA = 100000

# Checked in this order after the "Name" test; 'mother' must stay ahead of
# 'other' because the latter is a substring of the former.
_KEYWORD_COLUMNS = (
    ('father', 'Father Name'),
    ('husband', 'Husband Name'),
    ('mother', 'Mother Name'),
    ('Gender', 'Gender'),
    ('house', 'House Number'),
    ('wife', 'Wife Name'),
    ('other', 'Others Name'),
)

_TWO_DIGITS = re.compile(r"\d{2}")
_PATH_SEPARATORS = re.compile(r'[\\/]')


def _safe_component(value):
    """Return *value* with any directory part removed.

    The value comes from the request body, so it must not be able to steer
    writes outside the working directories.

    Raises:
        ValueError: if nothing usable remains.
    """
    component = _PATH_SEPARATORS.split(str(value))[-1]
    if component in ('', '.', '..'):
        raise ValueError('invalid file name component: %r' % (value,))
    return component


def _ocr_texts(ocr, image):
    """Run OCR on *image* and return the recognised strings in result order.

    Returns an empty list when the engine reports nothing.
    """
    result = ocr.ocr(image)
    # NOTE(review): PaddleOCR appears to return [None] for an image with no
    # detected text; guard so that case is treated as "no text".
    if not result or result[0] is None:
        return []
    return [line[1][0] for line in result[0]]


def _parse_voter_box(txts, street):
    """Turn the OCR strings of one box into a CSV row dict.

    Args:
        txts: recognised strings of the box, in OCR order.
        street: text appended (after a comma) to the house number.

    Returns:
        A dict keyed by a subset of ``CSV_FIELDS``, or ``None`` when the box
        cannot be parsed: it lacks the 'Photo is' / 'Available' strings, has
        fewer than two other strings, or has no age or no voter number.
    """
    if 'Photo is' not in txts or 'Available' not in txts:
        return None
    lines = list(txts)
    lines.remove('Photo is')
    lines.remove('Available')
    if len(lines) < 2:
        return None

    # First two-digit run of every string mentioning "Age".
    ages = []
    for text in lines:
        match = _TWO_DIGITS.search(text) if 'Age' in text else None
        if match:
            ages.append(match.group())
    # Voter numbers are the strings mixing letters and digits.
    voter_ids = [
        text for text in lines
        if text.isalnum() and not text.isalpha() and not text.isdigit()
    ]
    if not ages or not voter_ids:
        return None

    labelled = [
        text.lower()
        .replace('name', 'Name:')
        .replace('number', 'Number:')
        .replace("gender", "',Gender:'")
        for text in lines
    ]
    # Mark the first two entries so the merge below never folds them away.
    labelled[0] = ":" + labelled[0]
    labelled[1] = ":" + labelled[1]
    # A string without ':' is a continuation: glue it onto the slot just
    # before it and blank its own slot.
    for idx in range(1, len(labelled)):
        if ":" not in labelled[idx]:
            labelled[idx - 1] += " " + labelled[idx]
            labelled[idx] = ""

    fields = [text for text in labelled if text != ""]
    fields = [text[1:] if text.startswith(":") else text for text in fields]
    fields = [text.replace('::', ':') for text in fields]

    record = {'Age': ages[0], 'Voter Number': voter_ids[0]}
    for field in fields:
        value = field.split(":")[-1].upper()
        if field.startswith("Name"):
            record['Name'] = value
            continue
        for keyword, column in _KEYWORD_COLUMNS:
            if keyword in field:
                if column == 'House Number':
                    value = value + ',' + street
                record[column] = value
                break
    return record


def _process_page(ocr, page_path, street, writer, csvfile):
    """OCR one page image, write a CSV row per parsed box, return the street.

    Args:
        ocr: the OCR engine.
        page_path: path of the page image to read.
        street: street text carried over from earlier pages.
        writer: ``csv.DictWriter`` receiving the rows.
        csvfile: the file behind *writer*, flushed after every row.

    Returns:
        The street text to use for following pages: the part after the last
        ':' of the last string containing 'Section' on this page, or the
        incoming *street* when the page has no such string.
    """
    print(page_path)
    img = cv2.imread(page_path)
    if img is None:
        # Unreadable page image: skip it instead of failing the request.
        return street
    if img.shape[0] > 1000:
        img = cv2.resize(img, (int(img.shape[1] * 0.7), int(img.shape[0] * 0.7)))

    for text in _ocr_texts(ocr, img):
        if 'Section' in text:
            street = text.split(':')[-1]

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    # 255 is the largest value an 8-bit image can hold.
    _, thresh = cv2.threshold(gray, 250, 255, cv2.THRESH_BINARY)
    contours, _ = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)

    # Visit boxes top-to-bottom, then left-to-right.
    rects = sorted(
        (cv2.boundingRect(contour) for contour in contours),
        key=lambda rect: (rect[1], rect[0]),
    )
    for left, top, width, height in rects:
        area = width * height
        if area < MIN_BOX_AREA or area > MAX_BOX_AREA:
            continue
        # NOTE(review): the rectangle is drawn on the page before cropping,
        # so it is part of the pixels handed to OCR; kept as in the original.
        cv2.rectangle(img, (left, top), (left + width, top + height), (0, 255, 0), 2)
        box = img[top:top + height, left:left + width]
        record = _parse_voter_box(_ocr_texts(ocr, box), street)
        if record is None:
            continue
        print(record)
        writer.writerow(record)
        csvfile.flush()
    return street


def _clear_directory(directory):
    """Delete every regular file directly inside *directory*."""
    for path in glob.glob(os.path.join(directory, '*')):
        if os.path.isfile(path):
            os.remove(path)


def _process_upload(entry, ocr):
    """Decode one uploaded PDF, OCR it and append its rows to the CSV.

    Args:
        entry: dict with the keys 'FileName', 'FileType' and 'FileData'.
        ocr: the OCR engine.

    Raises:
        KeyError: if a required key is missing from *entry*.
        ValueError: if the file name or type is unusable.
    """
    stem = _safe_component(entry['FileName'])
    name = stem + '.' + _safe_component(entry['FileType'])
    print(name)

    os.makedirs(UPLOAD_DIR, exist_ok=True)
    os.makedirs(IMAGE_DIR, exist_ok=True)
    pdf_path = os.path.join(UPLOAD_DIR, name)
    try:
        with open(pdf_path, "wb") as fh:
            fh.write(base64.decodebytes(entry['FileData'].encode()))

        # Render every page to a JPEG; the OCR step reads them back from disk.
        page_paths = []
        for index, page in enumerate(convert_from_path(pdf_path)):
            page_path = os.path.join(IMAGE_DIR, str(index) + '.jpg')
            page.save(page_path, 'JPEG')
            page_paths.append(page_path)
        print(len(page_paths))

        csv_path = stem + '.csv'
        # The file is appended to across requests; write the header once.
        needs_header = (
            not os.path.exists(csv_path) or os.path.getsize(csv_path) == 0
        )
        with open(csv_path, 'a', newline='') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=CSV_FIELDS)
            if needs_header:
                writer.writeheader()
            # Empty until a "Section" line is seen, so a house number found
            # before that still yields a row.
            street = ''
            for page_path in page_paths:
                street = _process_page(ocr, page_path, street, writer, csvfile)
    finally:
        # Remove the working files even when processing fails part-way.
        _clear_directory(UPLOAD_DIR)
        _clear_directory(IMAGE_DIR)


@app.route('/pdftotext', methods=["POST"])
def pdftotext():
    """Convert every PDF in the posted JSON list to CSV rows.

    Returns:
        'done' after all files were processed, or a 400 response when the
        body is not a JSON list.
    """
    entries = request.get_json()
    if not isinstance(entries, list):
        return 'expected a JSON list of file objects', 400

    # Build the engine once per request rather than once per file.
    ocr = PaddleOCR(use_gpu=True, use_angle_cls=True, lang='en',
                    use_space_char=True, show_log=False, type='structure',
                    image_enhance=True)
    for entry in entries:
        _process_upload(entry, ocr)
    return 'done'


if __name__ == "__main__":
    app.run(host='0.0.0.0', port=1112)