|
@@ -0,0 +1,31 @@
|
|
1
|
+from langchain.embeddings import HuggingFaceEmbeddings
|
|
2
|
+from langchain.vectorstores import FAISS
|
|
3
|
+from langchain.document_loaders import PyPDFLoader, DirectoryLoader
|
|
4
|
+from langchain.text_splitter import RecursiveCharacterTextSplitter
|
|
5
|
+from langchain.document_loaders.csv_loader import CSVLoader
|
|
6
|
+
|
|
7
|
+
|
|
8
|
# Directory intended to hold raw input documents.
# NOTE(review): no active code visible in this file reads this constant.
DATA_PATH = 'data/'

# Filesystem location where the FAISS index is saved.
DB_FAISS_PATH = 'vectorstore/db_faiss'
|
|
10
|
+
|
|
11
|
def create_vector_db(csv_path="./supportqa.csv", db_path=None,
                     chunk_size=500, chunk_overlap=50):
    """Build a FAISS vector store from a CSV file and save it to disk.

    The CSV is loaded with ``CSVLoader`` (using the ``Question`` column as
    the document source), split into overlapping chunks, embedded with the
    ``sentence-transformers/all-MiniLM-L6-v2`` model on CPU, indexed with
    FAISS, and written to ``db_path``.

    Args:
        csv_path: Path of the CSV file to ingest. Defaults to the file the
            script has always used, ``./supportqa.csv``.
        db_path: Where to save the FAISS index. ``None`` (the default)
            means the module-level ``DB_FAISS_PATH``.
        chunk_size: Maximum chunk size passed to the text splitter.
        chunk_overlap: Overlap between consecutive chunks passed to the
            text splitter.

    Raises:
        ValueError: If loading and splitting the CSV yields no text chunks.
    """
    # Resolve the default at call time so the module constant is looked up
    # lazily rather than frozen into the signature.
    if db_path is None:
        db_path = DB_FAISS_PATH

    # The file is read as ISO-8859-1, matching the original script.
    loader = CSVLoader(file_path=csv_path,
                       encoding='iso-8859-1',
                       source_column="Question")
    documents = loader.load()

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size,
                                                   chunk_overlap=chunk_overlap)
    texts = text_splitter.split_documents(documents)

    # Fail early with a clear message instead of letting index construction
    # fail on an empty corpus.
    if not texts:
        raise ValueError(
            f"No documents could be loaded from {csv_path!r}; "
            "refusing to build an empty vector store."
        )

    embeddings = HuggingFaceEmbeddings(
        model_name='sentence-transformers/all-MiniLM-L6-v2',
        model_kwargs={'device': 'cpu'},
    )

    db = FAISS.from_documents(texts, embeddings)
    db.save_local(db_path)
|
|
28
|
+
|
|
29
|
# Build the vector store only when this file is executed as a script,
# so importing the module has no side effects.
if __name__ == "__main__":
    create_vector_db()
|
|
31
|
+
|