12345678910111213141516171819202122232425262728293031 |
- from langchain.embeddings import HuggingFaceEmbeddings
- from langchain.vectorstores import FAISS
- from langchain.document_loaders import PyPDFLoader, DirectoryLoader
- from langchain.text_splitter import RecursiveCharacterTextSplitter
- from langchain.document_loaders.csv_loader import CSVLoader
-
-
# Directory holding raw source documents (referenced by the optional
# directory-based loader).
DATA_PATH = "data/"
# Location where the FAISS index is persisted by create_vector_db().
DB_FAISS_PATH = "vectorstore/db_faiss"
-
- # Create vector database
def create_vector_db(csv_path="./supportqa.csv", db_path=None):
    """Build a FAISS vector store from a CSV file and save it to disk.

    The CSV is read with ``CSVLoader`` (``iso-8859-1`` encoding, with the
    ``Question`` column used as each document's source), split with
    ``RecursiveCharacterTextSplitter`` (``chunk_size=500``,
    ``chunk_overlap=50``), embedded on the CPU with
    ``sentence-transformers/all-MiniLM-L6-v2`` and written out with
    ``FAISS.save_local``.

    Args:
        csv_path: Path of the CSV file to index. Defaults to the file the
            script was originally hard-wired to.
        db_path: Destination for the saved index. ``None`` (the default)
            means the module-level ``DB_FAISS_PATH``, looked up at call time.

    Raises:
        ValueError: If loading and splitting the CSV yields no text chunks.
    """
    # Resolve the default lazily so a changed DB_FAISS_PATH is still honoured.
    target_path = DB_FAISS_PATH if db_path is None else db_path

    loader = CSVLoader(
        file_path=csv_path,
        encoding='iso-8859-1',
        source_column="Question",
    )
    documents = loader.load()

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=50,
    )
    texts = text_splitter.split_documents(documents)

    # Fail early with a clear message rather than letting an empty input
    # surface as an opaque error during index construction.
    if not texts:
        raise ValueError(
            f"No documents were loaded from {csv_path!r}; nothing to index."
        )

    embeddings = HuggingFaceEmbeddings(
        model_name='sentence-transformers/all-MiniLM-L6-v2',
        model_kwargs={'device': 'cpu'},
    )

    db = FAISS.from_documents(texts, embeddings)
    db.save_local(target_path)
-
# Script entry point: rebuild the vector store only when this file is run
# directly, not when it is imported.
if __name__ == "__main__":
    create_vector_db()
-
|