瀏覽代碼

Upload files to 'Electrol'

SadhulaSaiKumar 2 年之前
父節點
當前提交
f0d96c99ba
共有 1 個檔案被更改,包括 200 行新增0 行删除
  1. 200
    0
      Electrol/app.py

+ 200
- 0
Electrol/app.py 查看文件

@@ -0,0 +1,200 @@
1
+from flask import Flask, render_template, request, redirect, Response, send_file
2
+import glob
3
+from pdf2image import convert_from_path
4
+from PyPDF2 import PdfReader
5
+
6
+import pandas as pd
7
+import csv
8
+import os
9
+import cv2
10
+import numpy as np
11
+from paddleocr import PaddleOCR, draw_ocr
12
+import re
13
+#ocr = PaddleOCR(use_angle_cls=True, lang='en', use_space_char=True, show_log=False,type='structure',image_enhance=True)
14
+
15
+app = Flask(__name__)
16
+
17
# Working directories, relative to the process working directory.
_UPLOAD_DIR = './uploads/'
_IMAGE_DIR = './images/'

# Column order of the generated CSV files.
_CSV_FIELDS = (
    'Name', 'Father Name', 'Mother Name', 'Husband Name', 'Wife Name',
    'Others Name', 'House Number', 'Age', 'Gender', 'Voter Number',
)

# Only contour bounding boxes whose pixel area lies in this range are OCR'd.
_MIN_BOX_AREA = 9000
_MAX_BOX_AREA = 100000

# Keyword -> CSV column, checked in this order.  'mother' must stay ahead of
# 'other' because the former contains the latter.
_KEYWORD_LABELS = (
    ('father', 'Father Name'),
    ('husband', 'Husband Name'),
    ('mother', 'Mother Name'),
    ('Gender', 'Gender'),
    ('house', 'House Number'),
    ('wife', 'Wife Name'),
    ('other', 'Others Name'),
)

_TWO_DIGITS = re.compile(r"\d{2}")


def _safe_component(value):
    """Reduce *value* to a bare file-name component, or return None.

    The value comes straight from the request body, so directory parts
    (with either separator) are dropped to keep writes inside the working
    directories.
    """
    if not isinstance(value, str):
        return None
    component = os.path.basename(value.replace('\\', '/'))
    if component in ('', '.', '..'):
        return None
    return component


def _clear_dir(directory):
    """Delete every regular file directly inside *directory*."""
    for path in glob.glob(directory + '*'):
        if os.path.isfile(path):
            os.remove(path)


def _ocr_lines(ocr, image):
    """Run OCR on *image* and return the recognised text strings.

    NOTE(review): the guard assumes ``ocr.ocr`` can yield an empty/None first
    element when nothing is detected -- confirm against the installed
    PaddleOCR version.
    """
    result = ocr.ocr(image)
    if not result or not result[0]:
        return []
    return [line[1][0] for line in result[0]]


def _parse_voter_box(txts, street):
    """Turn the OCR strings of one box into a CSV row dict, or None.

    A box is skipped (None) when it lacks the 'Photo is' / 'Available'
    placeholder strings, has fewer than two remaining strings, or has no
    two-digit age or no mixed letter/digit voter number.
    """
    if 'Photo is' not in txts or 'Available' not in txts:
        return None
    txts = list(txts)
    txts.remove('Photo is')
    txts.remove('Available')
    if len(txts) < 2:
        return None

    # First two-digit run in a string mentioning "Age".
    age = None
    for text in txts:
        if "Age" in text:
            match = _TWO_DIGITS.search(text)
            if match:
                age = match.group(0)
                break
    if age is None:
        return None

    # First purely alphanumeric string that mixes letters and digits.
    voter_id = next(
        (text for text in txts
         if text.isalnum() and not text.isalpha() and not text.isdigit()),
        None,
    )
    if voter_id is None:
        return None

    fields = [
        text.lower()
            .replace('name', 'Name:')
            .replace('number', 'Number:')
            .replace("gender", "',Gender:'")
        for text in txts
    ]

    # Mark the first two strings as standalone so the merge step below does
    # not fold them into a neighbour.
    fields[0] = ":" + fields[0]
    fields[1] = ":" + fields[1]

    # A string without ':' is treated as a continuation of the previous one.
    for idx in range(1, len(fields)):
        if ":" not in fields[idx]:
            fields[idx - 1] += " " + fields[idx]
            fields[idx] = ""

    fields = [field for field in fields if field != ""]
    fields = [field[1:] if field.startswith(":") else field for field in fields]
    fields = [field.replace('::', ':') for field in fields]

    # The row is built per box so a skipped box can never leak values into
    # the next record.
    row = {'Age': age, 'Voter Number': voter_id}
    for field in fields:
        if field.startswith("Name"):
            label = 'Name'
        else:
            label = next(
                (column for keyword, column in _KEYWORD_LABELS if keyword in field),
                None,
            )
        if label is None:
            continue
        value = field.split(":")[-1].upper()
        if label == 'House Number':
            value = value + ',' + street
        row[label] = value
    return row


def _process_page(ocr, image_path, street, writer, csvfile):
    """OCR one page image, write its voter rows, and return the street text.

    *street* is the most recent "Section" text seen so far; the returned
    value replaces it when this page contains a newer one.
    """
    print(image_path)
    img = cv2.imread(image_path)
    if img is None:
        # Unreadable or missing page image: nothing to extract.
        return street

    if img.shape[0] > 1000:
        img = cv2.resize(img, (int(img.shape[1] * 0.7), int(img.shape[0] * 0.7)))

    for text in _ocr_lines(ocr, img):
        if 'Section' in text:
            street = text.split(':')[-1]

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    # NOTE(review): maxval 300 exceeds the 8-bit range; kept as in the
    # original -- confirm whether 255 was intended.
    _, thresh = cv2.threshold(gray, 250, 300, cv2.THRESH_BINARY)
    contours, _ = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)

    # Visit boxes top-to-bottom, then left-to-right.
    rects = sorted(
        (cv2.boundingRect(contour) for contour in contours),
        key=lambda rect: (rect[1], rect[0]),
    )

    for left, top, width, height in rects:
        area = width * height
        if area < _MIN_BOX_AREA or area > _MAX_BOX_AREA:
            continue
        # NOTE(review): the rectangle is drawn on the page before cropping, so
        # the crop passed to OCR includes the outline; kept to preserve the
        # original OCR input.
        cv2.rectangle(img, (left, top), (left + width, top + height), (0, 255, 0), 2)
        box = img[top:top + height, left:left + width]

        row = _parse_voter_box(_ocr_lines(ocr, box), street)
        if row is None:
            continue
        print(row)
        writer.writerow(row)
        csvfile.flush()

    return street


@app.route('/pdftotext', methods=["POST"])
def pdftotext():
    """Extract voter records from uploaded PDFs into per-file CSVs.

    The request body is a JSON list of objects with 'FileData' (base64 text),
    'FileName' and 'FileType'.  Each PDF is rendered to JPEG pages, OCR'd box
    by box, and the parsed rows are appended to '<FileName>.csv' in the
    working directory.  The upload and image directories are emptied after
    every file.

    Returns 'done' on success, or a 400 response when the payload is not a
    list of well-formed descriptors.
    """
    import base64  # not imported at module level in this file

    payload = request.get_json()
    if not isinstance(payload, list):
        return 'expected a JSON list of file descriptors', 400
    if not payload:
        return 'done'

    os.makedirs(_UPLOAD_DIR, exist_ok=True)
    os.makedirs(_IMAGE_DIR, exist_ok=True)

    # Built once per request rather than once per file.
    ocr = PaddleOCR(use_gpu=True,use_angle_cls=True, lang='en', use_space_char=True, show_log=False,type='structure',image_enhance=True)

    for entry in payload:
        if not isinstance(entry, dict):
            return 'each file descriptor must be a JSON object', 400
        file_data = entry.get('FileData')
        stem = _safe_component(entry.get('FileName'))
        extension = _safe_component(entry.get('FileType'))
        if not isinstance(file_data, str) or stem is None or extension is None:
            return 'FileData, FileName and FileType are required', 400

        name = stem + '.' + extension
        print(name)
        pdf_path = _UPLOAD_DIR + name
        csv_path = stem + '.csv'

        try:
            with open(pdf_path, "wb") as fh:
                fh.write(base64.decodebytes(file_data.encode()))

            # Render every PDF page to ./images/<page index>.jpg
            images = convert_from_path(pdf_path)
            for page_no, image in enumerate(images):
                image.save(_IMAGE_DIR + str(page_no) + '.jpg', 'JPEG')
            page_count = len(images)
            print(page_count)

            # The file is opened for append, so only emit the header when the
            # CSV is new or empty; otherwise it would repeat mid-file.
            needs_header = (not os.path.exists(csv_path)
                            or os.path.getsize(csv_path) == 0)
            with open(csv_path, 'a', newline='') as csvfile:
                writer = csv.DictWriter(csvfile, fieldnames=_CSV_FIELDS)
                if needs_header:
                    writer.writeheader()

                # Empty until a "Section" line is seen, so a missing section
                # no longer raises NameError.
                street = ''
                for page_no in range(page_count):
                    page_path = _IMAGE_DIR + str(page_no) + '.jpg'
                    street = _process_page(ocr, page_path, street, writer, csvfile)
        finally:
            # Clean the directories that were actually written to, even when
            # processing fails part-way.
            _clear_dir(_UPLOAD_DIR)
            _clear_dir(_IMAGE_DIR)

    return 'done'
196
+        
197
+
198
+
199
# Start the Flask development server on all interfaces, port 1112, when the
# file is executed directly.
if __name__ == "__main__":
    app.run(host='0.0.0.0', port=1112)

Loading…
取消
儲存