|
@@ -0,0 +1,200 @@
|
|
1
|
+from flask import Flask, render_template, request, redirect, Response, send_file
|
|
2
|
+import glob
|
|
3
|
+from pdf2image import convert_from_path
|
|
4
|
+from PyPDF2 import PdfReader
|
|
5
|
+
|
|
6
|
+import pandas as pd
|
|
7
|
+import csv
|
|
8
|
+import os
|
|
9
|
+import cv2
|
|
10
|
+import numpy as np
|
|
11
|
+from paddleocr import PaddleOCR, draw_ocr
|
|
12
|
+import re
|
|
13
|
+#ocr = PaddleOCR(use_angle_cls=True, lang='en', use_space_char=True, show_log=False,type='structure',image_enhance=True)
|
|
14
|
+
|
|
15
|
# Flask application object; the '/pdftotext' route and the __main__ runner use it.
+app = Flask(__name__)
|
|
16
|
+
|
|
17
|
# Column order of the CSV written for each uploaded PDF.
_VOTER_FIELDS = (
    'Name', 'Father Name', 'Mother Name', 'Husband Name', 'Wife Name',
    'Others Name', 'House Number', 'Age', 'Gender', 'Voter Number',
)

# (substring looked for in a normalised OCR line, CSV column it fills).
# Order matters: the first match wins, and "mother" must be tested before
# "other" because the former contains the latter.
_LABEL_RULES = (
    ('father', 'Father Name'),
    ('husband', 'Husband Name'),
    ('mother', 'Mother Name'),
    ('Gender', 'Gender'),
    ('house', 'House Number'),
    ('wife', 'Wife Name'),
    ('other', 'Others Name'),
)

# Bounding-box area (in pixels, after the optional page down-scale) that a
# contour must have to be OCR'd as a record box.
_MIN_BOX_AREA = 9000
_MAX_BOX_AREA = 100000


def _ocr_lines(ocr, image):
    """Return the text strings PaddleOCR recognised in *image*.

    Returns an empty list when the engine reports nothing, so callers never
    have to iterate over ``None``.
    """
    result = ocr.ocr(image)
    if not result or not result[0]:
        return []
    return [line[1][0] for line in result[0]]


def _parse_voter_box(texts, street):
    """Turn the OCR lines of one box into a CSV row dict.

    Returns ``None`` when the box should be skipped, which (as in the
    original code) happens when any of these is missing: the exact lines
    'Photo is' and 'Available', a line containing "Age" with two digits, a
    mixed letter/digit token used as the voter number, or at least two
    remaining lines.
    NOTE(review): the 'Photo is' / 'Available' requirement presumably
    identifies real voter cards; confirm it is not dropping valid boxes
    where OCR joins the two words into one line.
    """
    texts = list(texts)
    try:
        texts.remove('Photo is')
        texts.remove('Available')
    except ValueError:
        return None
    if len(texts) < 2:
        return None

    age = None
    for text in texts:
        if "Age" in text:
            match = re.search(r"\d{2}", text)
            if match:
                age = match.group()
                break

    voter_id = None
    for text in texts:
        if text.isalnum() and not text.isalpha() and not text.isdigit():
            voter_id = text
            break

    if age is None or voter_id is None:
        return None

    # Lower-case everything and re-insert a ':' after the label keywords so
    # that labelled lines can be told apart from continuation lines.
    items = [
        text.lower()
            .replace('name', 'Name:')
            .replace('number', 'Number:')
            .replace("gender", "',Gender:'")
        for text in texts
    ]
    # The first two lines are never treated as continuations.
    items[0] = ":" + items[0]
    items[1] = ":" + items[1]

    # Append every colon-less line to the most recent labelled line.  The
    # original appended to the directly preceding element even after that
    # element had been blanked, which orphaned the 2nd+ wrapped line.
    merged = []
    for position, item in enumerate(items):
        if position == 0 or ":" in item:
            merged.append(item)
        else:
            merged[-1] += " " + item

    record = {"Age": age, "Voter Number": voter_id}
    for item in merged:
        if item.startswith(":"):
            item = item[1:]
        item = item.replace('::', ':')
        value = item.split(":")[-1].upper()
        if item.startswith("Name"):
            record["Name"] = value
            continue
        for needle, column in _LABEL_RULES:
            if needle in item:
                if column == 'House Number':
                    value = value + ',' + street
                record[column] = value
                break
    return record


def _read_page(ocr, img, street):
    """Extract all voter rows from one page image.

    *img* is a BGR array as returned by ``cv2.imread``; *street* is the
    value carried over from earlier pages.  Returns ``(rows, street)`` where
    *street* has been replaced if this page has a line containing 'Section'.
    """
    if img.shape[0] > 1000:
        img = cv2.resize(img, (int(img.shape[1] * 0.7), int(img.shape[0] * 0.7)))

    # The last 'Section' line on the page supplies the street text that is
    # appended to every house number.
    for text in _ocr_lines(ocr, img):
        if 'Section' in text:
            street = text.split(':')[-1]

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    _, thresh = cv2.threshold(gray, 250, 300, cv2.THRESH_BINARY)
    contours, _ = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)

    # Visit boxes top-to-bottom, then left-to-right.
    rects = sorted(
        (cv2.boundingRect(contour) for contour in contours),
        key=lambda rect: (rect[1], rect[0]),
    )

    rows = []
    for left, top, width, height in rects:
        if not _MIN_BOX_AREA <= width * height <= _MAX_BOX_AREA:
            continue
        # Kept from the original: the outline is drawn on the page before
        # cropping, so it is part of the pixels handed to OCR.
        cv2.rectangle(img, (left, top), (left + width, top + height), (0, 255, 0), 2)
        box = img[top:top + height, left:left + width]
        row = _parse_voter_box(_ocr_lines(ocr, box), street)
        if row is not None:
            rows.append(row)
    return rows, street


@app.route('/pdftotext', methods=["POST"])
def pdftotext():
    """Extract voter records from base64-encoded PDFs into CSV files.

    Expects a JSON array; each element is an object with

    * ``FileData`` - the PDF content, base64 encoded (str)
    * ``FileName`` - base name for the output; rows are appended to
      ``<FileName>.csv`` in the current working directory
    * ``FileType`` - accepted for compatibility; no longer used because the
      upload is stored under a fixed name in a temporary directory

    Returns ``'done'`` on success, or a message with status 400 when the
    payload is malformed.

    Changes from the previous implementation:

    * ``FileName`` is reduced to its last path component, so a request can
      no longer write outside the working directory.
    * The PDF and page images live in a per-file temporary directory that is
      removed even when processing fails (previously ``./uploads`` and
      ``./images`` were written, but a hard-coded ``D:\\`` path was cleaned).
    * The CSV header is only written when the file is new or empty.
    * ``street`` starts empty for each PDF instead of being unbound.
    * The OCR engine is built once per request, not once per file.
    """
    import base64
    import tempfile

    dataset = request.get_json()
    if not isinstance(dataset, list):
        return 'expected a JSON array of file objects', 400

    ocr = None
    for entry in dataset:
        try:
            file_data = entry['FileData']
            file_name = entry['FileName']
        except (KeyError, TypeError):
            return "each item needs 'FileData' and 'FileName'", 400
        if not isinstance(file_data, str) or not isinstance(file_name, str):
            return "'FileData' and 'FileName' must be strings", 400

        # Untrusted input: keep only the final path component (treating
        # both '/' and '\\' as separators) before building the CSV path.
        csv_stem = os.path.basename(file_name.replace('\\', '/'))
        if csv_stem in ('', '.', '..'):
            return "'FileName' is not a usable file name", 400

        try:
            pdf_bytes = base64.decodebytes(file_data.encode())
        except ValueError:  # binascii.Error is a ValueError subclass
            return "'FileData' is not valid base64", 400

        if ocr is None:
            ocr = PaddleOCR(use_gpu=True, use_angle_cls=True, lang='en',
                            use_space_char=True, show_log=False,
                            type='structure', image_enhance=True)

        with tempfile.TemporaryDirectory() as workdir:
            pdf_path = os.path.join(workdir, 'upload.pdf')
            with open(pdf_path, "wb") as fh:
                fh.write(pdf_bytes)

            pages = convert_from_path(pdf_path)
            app.logger.info("processing %s (%d pages)", csv_stem, len(pages))

            csv_path = csv_stem + '.csv'
            needs_header = (not os.path.exists(csv_path)
                            or os.path.getsize(csv_path) == 0)
            with open(csv_path, 'a', newline='') as csvfile:
                writer = csv.DictWriter(csvfile, fieldnames=_VOTER_FIELDS)
                if needs_header:
                    writer.writeheader()

                street = ''
                for page_no, page in enumerate(pages):
                    # Round-trip through JPEG exactly as before so OCR and
                    # thresholding see the same pixels as the old pipeline.
                    jpg_path = os.path.join(workdir, str(page_no) + '.jpg')
                    page.save(jpg_path, 'JPEG')
                    img = cv2.imread(jpg_path)
                    if img is None:
                        app.logger.warning("could not read page %d of %s",
                                           page_no, csv_stem)
                        continue
                    rows, street = _read_page(ocr, img, street)
                    writer.writerows(rows)
                    # Make finished pages visible on disk while later pages
                    # are still being processed.
                    csvfile.flush()

    return 'done'
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
|
|
199
|
if __name__ == "__main__":
    # Script entry point: start Flask's built-in server on every interface,
    # port 1112.
    app.run(port=1112, host='0.0.0.0')
|