123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200 |
- from flask import Flask, render_template, request, redirect, Response, send_file
- import glob
- from pdf2image import convert_from_path
- from PyPDF2 import PdfReader
-
- import pandas as pd
- import csv
- import os
- import cv2
- import numpy as np
- from paddleocr import PaddleOCR, draw_ocr
- import re
- #ocr = PaddleOCR(use_angle_cls=True, lang='en', use_space_char=True, show_log=False,type='structure',image_enhance=True)
-
# Flask application object; the route handler below registers itself on it
# and the __main__ guard at the bottom of the file runs it.
app = Flask(__name__)
-
# Scratch directories (relative to the working directory) holding the decoded
# upload and the page images rendered from it.
_UPLOAD_DIR = './uploads/'
_IMAGE_DIR = './images/'

# Column order of the CSV written for each uploaded file.
_CSV_FIELDS = [
    'Name', 'Father Name', 'Mother Name', 'Husband Name', 'Wife Name',
    'Others Name', 'House Number', 'Age', 'Gender', 'Voter Number',
]

# Bounding-box area limits (pixels) for a contour to be OCR'd as an entry.
_MIN_BOX_AREA = 9000
_MAX_BOX_AREA = 100000

# Ordered (marker, CSV column) pairs; the first marker contained in a line
# wins.  "mother" has to stay ahead of "other", which is a substring of it.
_FIELD_MARKERS = (
    ('father', 'Father Name'),
    ('husband', 'Husband Name'),
    ('mother', 'Mother Name'),
    ('Gender', 'Gender'),
    ('house', 'House Number'),
    ('wife', 'Wife Name'),
    ('other', 'Others Name'),
)


def _safe_component(value):
    """Return *value* reduced to a bare file name without directory parts.

    Raises:
        ValueError: if nothing usable is left (empty, '.' or '..').
    """
    # Normalise backslashes so Windows-style separators are stripped on every
    # platform, then keep only the final path component.
    component = os.path.basename(value.replace('\\', '/'))
    if component in ('', '.', '..'):
        raise ValueError('unusable file name: %r' % (value,))
    return component


def _clear_dir(directory):
    """Delete every regular file directly inside *directory*."""
    for path in glob.glob(directory + '*'):
        if os.path.isfile(path):
            os.remove(path)


def _ocr_texts(ocr, image):
    """Run *ocr* on *image* and return the recognised strings in order."""
    result = ocr.ocr(image)
    # NOTE(review): PaddleOCR appears to report an image without any detected
    # text as ``[None]``; treat that (and an empty result) as "no text"
    # instead of failing on ``None`` -- confirm against the installed version.
    lines = result[0] if result else None
    return [line[1][0] for line in lines or []]


def _parse_voter_box(texts, street):
    """Turn the OCR strings of one box into a CSV row dict.

    Args:
        texts: strings recognised inside the box, in OCR order.
        street: text appended (after a comma) to the house number.

    Returns:
        A dict keyed by CSV column name, or ``None`` when the box lacks the
        'Photo is' / 'Available' strings, has fewer than two other strings,
        or has no age or voter number.
    """
    texts = list(texts)
    # Boxes without both placeholder strings are skipped entirely.
    if 'Photo is' not in texts or 'Available' not in texts:
        return None
    texts.remove('Photo is')
    texts.remove('Available')
    if len(texts) < 2:
        return None

    # Age: first two-digit run in the first string that mentions "Age" and
    # contains one.
    age = None
    for text in texts:
        if "Age" in text:
            match = re.search(r"\d{2}", text)
            if match:
                age = match.group(0)
                break

    # Voter number: first purely alphanumeric string mixing letters and digits.
    voter_id = next(
        (text for text in texts
         if text.isalnum() and not text.isalpha() and not text.isdigit()),
        None,
    )
    if age is None or voter_id is None:
        return None

    lines = [
        text.lower()
        .replace('name', 'Name:')
        .replace('number', 'Number:')
        .replace("gender", "',Gender:'")
        for text in texts
    ]

    # Mark the first two strings as containing ":" so the merge step below
    # never folds the second string into the first; the marker is stripped
    # again afterwards.
    lines[0] = ":" + lines[0]
    lines[1] = ":" + lines[1]

    # A string without ":" is appended to the entry directly before it.
    for index in range(1, len(lines)):
        if ":" not in lines[index]:
            lines[index - 1] += " " + lines[index]
            lines[index] = ""

    cleaned = [
        (line[1:] if line.startswith(":") else line).replace('::', ':')
        for line in lines
        if line != ""
    ]

    record = {"Age": age, "Voter Number": voter_id}
    for line in cleaned:
        value = line.split(":")[-1].upper()
        if line.startswith("Name"):
            record["Name"] = value
            continue
        for marker, column in _FIELD_MARKERS:
            if marker in line:
                if column == "House Number":
                    value = value + ',' + street
                record[column] = value
                break
    return record


def _render_pages(pdf_path):
    """Render every page of *pdf_path* to a JPEG and return the image paths."""
    page_paths = []
    for index, page in enumerate(convert_from_path(pdf_path)):
        path = _IMAGE_DIR + str(index) + '.jpg'
        page.save(path, 'JPEG')
        page_paths.append(path)
    return page_paths


def _process_pages(ocr, page_paths, writer, csvfile):
    """OCR each page image and write one CSV row per recognised box."""
    # The most recent "Section" text is kept across pages; it starts empty so
    # a house number seen before any section heading does not raise NameError.
    street = ''
    for path in page_paths:
        print(path)
        img = cv2.imread(path)
        if img is None:
            # cv2.imread returns None for an unreadable file; skip the page.
            continue
        if img.shape[0] > 1000:
            img = cv2.resize(img, (int(img.shape[1] * 0.7), int(img.shape[0] * 0.7)))

        for text in _ocr_texts(ocr, img):
            if 'Section' in text:
                street = text.split(':')[-1]

        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        # 255 is the largest value an 8-bit image can hold (the previous
        # maxval of 300 was out of range for it).
        _, thresh = cv2.threshold(gray, 250, 255, cv2.THRESH_BINARY)
        contours, _ = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)

        # Visit boxes top-to-bottom, then left-to-right.
        boxes = sorted(
            (cv2.boundingRect(contour) for contour in contours),
            key=lambda rect: (rect[1], rect[0]),
        )
        for left, top, width, height in boxes:
            if not _MIN_BOX_AREA <= width * height <= _MAX_BOX_AREA:
                continue
            # NOTE(review): the frame is drawn on the page before cropping, so
            # it changes pixels the OCR sees; kept so OCR input is unchanged.
            cv2.rectangle(img, (left, top), (left + width, top + height), (0, 255, 0), 2)
            # NOTE(review): cropping/OCR is assumed to apply only to boxes that
            # pass the area filter -- confirm against the original layout.
            crop = img[top:top + height, left:left + width]
            record = _parse_voter_box(_ocr_texts(ocr, crop), street)
            if record is None:
                continue
            print(record)
            writer.writerow(record)
            csvfile.flush()


@app.route('/pdftotext', methods=["POST"])
def pdftotext():
    """Extract voter entries from uploaded PDFs into per-file CSVs.

    The request body is a JSON list of objects with the keys ``FileData``
    (base64 text of the file), ``FileName`` and ``FileType``.  Each file is
    decoded into the upload directory, rendered to page images and OCR'd;
    every recognised box is appended as a row to ``<FileName>.csv`` in the
    working directory.

    Returns:
        ``'done'`` on success, or a 400 response when the body is not a list
        of entries with usable names.
    """
    import base64

    entries = request.get_json()
    if not isinstance(entries, list):
        return 'expected a JSON list of file entries', 400

    # Validate up front; names come from the client, so directory parts are
    # stripped before they are used to build filesystem paths.
    try:
        jobs = [
            (
                entry['FileData'],
                _safe_component(entry['FileName'] + '.' + entry['FileType']),
                _safe_component(entry['FileName']),
            )
            for entry in entries
        ]
    except (KeyError, TypeError, ValueError):
        return 'each entry needs FileData, FileName and FileType', 400

    ocr = None
    try:
        for file_data, pdf_name, csv_stem in jobs:
            print(pdf_name)
            pdf_path = _UPLOAD_DIR + pdf_name
            with open(pdf_path, "wb") as fh:
                fh.write(base64.decodebytes(file_data.encode()))

            page_paths = _render_pages(pdf_path)
            print(len(page_paths))

            if ocr is None:
                # Built once per request and reused for every file in it.
                ocr = PaddleOCR(use_gpu=True,use_angle_cls=True, lang='en', use_space_char=True, show_log=False,type='structure',image_enhance=True)

            csv_path = csv_stem + '.csv'
            # The file is opened in append mode, so only emit the header when
            # it is new or empty instead of repeating it on every request.
            needs_header = not os.path.exists(csv_path) or os.path.getsize(csv_path) == 0
            with open(csv_path, 'a', newline='') as csvfile:
                writer = csv.DictWriter(csvfile, fieldnames=_CSV_FIELDS)
                if needs_header:
                    writer.writeheader()
                _process_pages(ocr, page_paths, writer, csvfile)
    finally:
        # Clean the same directories the files were written to, even when
        # processing failed part-way.
        # NOTE(review): this removes every file in both directories, so
        # overlapping requests would delete each other's files.
        _clear_dir(_UPLOAD_DIR)
        _clear_dir(_IMAGE_DIR)

    return 'done'
-
-
-
if __name__ == "__main__":
    # When the file is executed directly, serve the app with Flask's built-in
    # server on all interfaces, port 1112.
    app.run(host='0.0.0.0', port=1112)
|