Нема описа
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

12345678910111213141516171819202122232425262728293031
  1. from langchain.embeddings import HuggingFaceEmbeddings
  2. from langchain.vectorstores import FAISS
  3. from langchain.document_loaders import PyPDFLoader, DirectoryLoader
  4. from langchain.text_splitter import RecursiveCharacterTextSplitter
  5. from langchain.document_loaders.csv_loader import CSVLoader
# Data directory path. NOTE(review): not read by any active code visible in
# this file - confirm whether it is still needed.
DATA_PATH = 'data/'
# Location the FAISS index is written to by create_vector_db().
DB_FAISS_PATH = 'vectorstore/db_faiss'
  8. # Create vector database
  9. def create_vector_db():
  10. loader = CSVLoader(file_path="./supportqa.csv", encoding='iso-8859-1', source_column="Question")
  11. # loader = DirectoryLoader(DATA_PATH,
  12. # glob='*.pdf',
  13. # loader_cls=PyPDFLoader)
  14. documents = loader.load()
  15. text_splitter = RecursiveCharacterTextSplitter(chunk_size=500,
  16. chunk_overlap=50)
  17. texts = text_splitter.split_documents(documents)
  18. embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2',
  19. model_kwargs={'device': 'cpu'})
  20. db = FAISS.from_documents(texts, embeddings)
  21. db.save_local(DB_FAISS_PATH)
# Build and save the vector store only when this file is run as a script,
# so importing the module has no side effects.
if __name__ == "__main__":
    create_vector_db()