"""Embed a CSV file into a FAISS index saved on disk (Supportgpt/ingest.py).

Run as a script to build the index from ``CSV_PATH`` into ``DB_FAISS_PATH``.
"""
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders.csv_loader import CSVLoader


# NOTE: DATA_PATH and the PyPDFLoader/DirectoryLoader imports belong to the
# alternative PDF ingestion path
# (DirectoryLoader(DATA_PATH, glob='*.pdf', loader_cls=PyPDFLoader)),
# which is currently disabled in favour of the CSV loader.
DATA_PATH = 'data/'
DB_FAISS_PATH = 'vectorstore/db_faiss'
CSV_PATH = './supportqa.csv'
EMBEDDING_MODEL = 'sentence-transformers/all-MiniLM-L6-v2'


def create_vector_db(csv_path: str = CSV_PATH,
                     db_path: str = DB_FAISS_PATH) -> None:
    """Create the vector database from a CSV file and save it locally.

    The CSV is read with ``iso-8859-1`` encoding, using the ``Question``
    column as each document's source. Documents are split with
    ``chunk_size=500`` and ``chunk_overlap=50``, embedded on the CPU with
    ``EMBEDDING_MODEL``, and the resulting FAISS index is written to
    ``db_path``.

    Args:
        csv_path: Path of the CSV file to ingest. Defaults to ``CSV_PATH``.
        db_path: Destination passed to ``FAISS.save_local``. Defaults to
            ``DB_FAISS_PATH``.

    Raises:
        ValueError: If the CSV yields no documents to index.
    """
    loader = CSVLoader(file_path=csv_path,
                       encoding='iso-8859-1',
                       source_column="Question")
    documents = loader.load()

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500,
                                                   chunk_overlap=50)
    texts = text_splitter.split_documents(documents)

    # Fail early with a clear message rather than handing an empty corpus
    # to the (expensive) embedding and index-building steps.
    if not texts:
        raise ValueError(f"No documents found in {csv_path!r}; nothing to index.")

    embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL,
                                       model_kwargs={'device': 'cpu'})

    db = FAISS.from_documents(texts, embeddings)
    db.save_local(db_path)


if __name__ == "__main__":
    create_vector_db()