Upload files to 'Electrol'
This commit is contained in:
+200
@@ -0,0 +1,200 @@
|
|||||||
|
from flask import Flask, render_template, request, redirect, Response, send_file
|
||||||
|
import glob
|
||||||
|
from pdf2image import convert_from_path
|
||||||
|
from PyPDF2 import PdfReader
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
import csv
|
||||||
|
import os
|
||||||
|
import cv2
|
||||||
|
import numpy as np
|
||||||
|
from paddleocr import PaddleOCR, draw_ocr
|
||||||
|
import re
|
||||||
|
#ocr = PaddleOCR(use_angle_cls=True, lang='en', use_space_char=True, show_log=False,type='structure',image_enhance=True)
|
||||||
|
|
||||||
|
# Flask application object; the '/pdftotext' route below is registered on it
# and the __main__ guard at the bottom of the file starts it.
app = Flask(__name__)
|
||||||
|
|
||||||
|
# Scratch directories for the uploaded PDF and its rendered page images.
_UPLOAD_DIR = './uploads/'
_IMAGE_DIR = './images/'

# Column order of the CSV written for every uploaded file.
_CSV_COLUMNS = (
    'Name', 'Father Name', 'Mother Name', 'Husband Name', 'Wife Name',
    'Others Name', 'House Number', 'Age', 'Gender', 'Voter Number',
)

# Bounding-box area limits (pixels) for a contour to be OCR'd as a voter box.
_MIN_BOX_AREA = 9000
_MAX_BOX_AREA = 100000

# Keyword -> CSV column, checked in this order after the "Name" prefix test.
# "mother" must stay ahead of "other" because the former contains the latter.
_FIELD_KEYWORDS = (
    ("father", "Father Name"),
    ("husband", "Husband Name"),
    ("mother", "Mother Name"),
    ("Gender", "Gender"),
    ("house", "House Number"),
    ("wife", "Wife Name"),
    ("other", "Others Name"),
)

_TWO_DIGITS = re.compile(r"\d{2}")


def _ocr_texts(ocr, image):
    """Return the recognised text strings for *image*, or [] if none found."""
    result = ocr.ocr(image)
    # The first result entry may be None/empty when nothing is detected.
    if not result or not result[0]:
        return []
    return [line[1][0] for line in result[0]]


def _parse_voter_box(texts, street):
    """Turn the OCR lines of one box into a CSV row dict.

    Returns None when the box does not look like a voter record, i.e. when
    the 'Photo is' / 'Available' placeholder lines, a two-digit age or an
    alphanumeric voter id are missing, or fewer than two lines remain.
    *street* is appended to the house number.
    """
    lines = list(texts)
    if 'Photo is' not in lines or 'Available' not in lines:
        return None
    lines.remove('Photo is')
    lines.remove('Available')
    if len(lines) < 2:
        return None

    # Age: first two-digit run in the first line mentioning "Age" that has one.
    age_matches = (_TWO_DIGITS.search(text) for text in lines if "Age" in text)
    age = next((match.group() for match in age_matches if match), None)
    # Voter id: first token mixing letters and digits with nothing else.
    voter_id = next(
        (text for text in lines
         if text.isalnum() and not text.isalpha() and not text.isdigit()),
        None,
    )
    if age is None or voter_id is None:
        return None

    entries = [
        text.lower()
        .replace('name', 'Name:')
        .replace('number', 'Number:')
        .replace("gender", "',Gender:'")
        for text in lines
    ]
    # Prefixing ':' to the first two lines keeps the second from being merged
    # into the first below; the prefix is stripped again afterwards.
    entries[0] = ":" + entries[0]
    entries[1] = ":" + entries[1]
    # A line without ':' is appended to the entry directly before it.
    # NOTE(review): if that previous entry was itself just merged away, the
    # fragment survives as a stand-alone entry instead of joining the field
    # two lines up. Kept as in the original; confirm whether intended.
    for idx in range(1, len(entries)):
        if ":" not in entries[idx]:
            entries[idx - 1] += " " + entries[idx]
            entries[idx] = ""

    entries = [entry for entry in entries if entry != ""]
    entries = [entry[1:] if entry.startswith(":") else entry for entry in entries]
    entries = [entry.replace('::', ':') for entry in entries]

    record = {"Age": age, "Voter Number": voter_id}
    for entry in entries:
        value = entry.split(":")[-1].upper()
        if entry.startswith("Name"):
            record["Name"] = value
            continue
        for keyword, column in _FIELD_KEYWORDS:
            if keyword in entry:
                if column == "House Number":
                    value = value + ',' + street
                record[column] = value
                break
    return record


def _process_page(ocr, image_path, street, writer, csvfile):
    """OCR one rendered page and write a CSV row per detected voter box.

    Returns the street text to use for following pages: the text after the
    last ':' of the last line on this page containing 'Section', or the
    incoming *street* when the page has no such line.
    """
    img = cv2.imread(image_path)
    if img is None:
        # Unreadable image: nothing to extract from this page.
        return street
    if img.shape[0] > 1000:
        img = cv2.resize(img, (int(img.shape[1] * 0.7), int(img.shape[0] * 0.7)))

    for text in _ocr_texts(ocr, img):
        if 'Section' in text:
            street = text.split(':')[-1]

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    # maxval was 300 originally; for 8-bit images OpenCV saturates it to 255.
    _, thresh = cv2.threshold(gray, 250, 255, cv2.THRESH_BINARY)
    contours, _ = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
    # Visit boxes top-to-bottom, then left-to-right.
    rects = sorted(
        (cv2.boundingRect(contour) for contour in contours),
        key=lambda rect: (rect[1], rect[0]),
    )

    for x, y, w, h in rects:
        if not _MIN_BOX_AREA <= w * h <= _MAX_BOX_AREA:
            continue
        # NOTE(review): the outline is drawn on the page before cropping, so
        # the crop (and any later overlapping crop) contains it. Kept as in
        # the original to leave the OCR input unchanged.
        cv2.rectangle(img, (x, y), (x + w, y + h), (0, 255, 0), 2)
        box = img[y:y + h, x:x + w]
        record = _parse_voter_box(_ocr_texts(ocr, box), street)
        if record is None:
            continue
        print(record)
        writer.writerow(record)
        # Flush per row so partial results survive a later failure.
        csvfile.flush()
    return street


@app.route('/pdftotext', methods=["POST"])
def pdftotext():
    """Extract voter records from uploaded PDFs into '<FileName>.csv'.

    Expects a JSON list of objects with 'FileData' (base64 text), 'FileName'
    and 'FileType'. Each PDF is rendered to page images, every page is OCR'd,
    and one CSV row is appended per recognised voter box.

    Returns 'done' on success, or a message with status 400 when the payload
    is not a list of well-formed file objects.
    """
    import base64  # function-scope import, as in the original code

    payload = request.get_json()
    if not isinstance(payload, list):
        return 'Request body must be a JSON list of file objects', 400

    # Validate and decode everything first so a bad entry rejects the whole
    # request before any file is written.
    uploads = []
    for item in payload:
        try:
            # basename() strips directory parts so client-supplied names
            # cannot escape the scratch/output directories.
            base_name = os.path.basename(str(item['FileName']))
            extension = os.path.basename(str(item['FileType']))
            content = base64.decodebytes(item['FileData'].encode())
        except (KeyError, TypeError, AttributeError, ValueError):
            return 'Each file object needs FileName, FileType and base64 FileData', 400
        if not base_name:
            return 'FileName must not be empty', 400
        uploads.append((base_name, base_name + '.' + extension, content))

    if not uploads:
        return 'done'

    os.makedirs(_UPLOAD_DIR, exist_ok=True)
    os.makedirs(_IMAGE_DIR, exist_ok=True)
    # Build the OCR engine once per request instead of once per file.
    ocr = PaddleOCR(use_gpu=True,use_angle_cls=True, lang='en', use_space_char=True, show_log=False,type='structure',image_enhance=True)

    for base_name, file_name, content in uploads:
        print(file_name)
        pdf_path = _UPLOAD_DIR + file_name
        page_paths = []
        try:
            with open(pdf_path, "wb") as fh:
                fh.write(content)

            # pdf to images
            for index, page in enumerate(convert_from_path(pdf_path)):
                page_path = _IMAGE_DIR + str(index) + '.jpg'
                page_paths.append(page_path)
                page.save(page_path, 'JPEG')
            print(len(page_paths))

            csv_path = base_name + '.csv'
            # The CSV is opened in append mode; only a new/empty file gets a
            # header so repeated uploads do not insert extra header rows.
            needs_header = (
                not os.path.exists(csv_path) or os.path.getsize(csv_path) == 0
            )
            with open(csv_path, 'a', newline='') as csvfile:
                writer = csv.DictWriter(csvfile, fieldnames=_CSV_COLUMNS)
                if needs_header:
                    writer.writeheader()
                # Start empty so a "house" line seen before any "Section"
                # line no longer hits an unbound variable.
                street = ''
                for page_path in page_paths:
                    print(page_path)
                    street = _process_page(ocr, page_path, street, writer, csvfile)
        finally:
            # Remove this upload's scratch files even when processing failed.
            for path in [pdf_path, *page_paths]:
                try:
                    os.remove(path)
                except FileNotFoundError:
                    pass

    return 'done'
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Direct execution only: serve the app on every interface, port 1112.
    app.run(port=1112, host='0.0.0.0')
|
||||||
Reference in New Issue
Block a user