(library_name, vector_db="chromadb")
| 19 | |
| 20 | |
def embeddings_fast_start(library_name, vector_db="chromadb"):
    """Build a sample library, embed it, and run a semantic query for 'Salary'.

    Steps performed, in order:
      1. Create a new library named ``library_name`` and add the files found
         in the "AgreementsLarge" folder of the loaded sample files.
      2. Install embeddings for that library using the "mini-lm-sbert" model,
         stored in the vector database named by ``vector_db``.
      3. Run a semantic query for "Salary" (``result_count=10``,
         ``results_only=True``) and print a short summary of every result.

    Returns the value produced by ``semantic_query`` unchanged.
    """

    # Step 1 - create the library and load the sample agreements into it.
    print(f"\nstep 1 - creating and populating library: {library_name}...")
    library = Library().create_new_library(library_name)
    samples_root = Setup().load_sample_files()
    agreements_dir = os.path.join(samples_root, "AgreementsLarge")
    library.add_files(input_folder_path=agreements_dir)

    # Step 2 - embeddings only need an embedding model name and a vector DB.
    # Other model families (HuggingFace, SentenceTransformer) are covered by
    # the sibling examples in this folder.
    print(f"\n > Generating embedding vectors and storing in '{vector_db}'...")
    library.install_new_embedding(
        embedding_model_name="mini-lm-sbert",
        vector_db=vector_db,
    )

    # Step 3 - semantic search. No vector DB is passed to the query here; it
    # is built from the library that was just embedded.
    searcher = Query(library)
    print("\n > Running a query for 'Salary'...")
    hits = searcher.semantic_query(query="Salary", result_count=10, results_only=True)

    for rank, hit in enumerate(hits):
        # Each hit is indexed by key; these are the four fields reported.
        passage = hit["text"]
        source_doc = hit["file_source"]
        page = hit["page_num"]
        distance = hit["distance"]

        # Display only: cap the preview at 125 characters of the passage.
        preview = passage[0:125] + " ... " if len(passage) > 125 else passage

        print(
            f"\nupdate: query results - {rank} - document - {source_doc}"
            f" - page num - {page} - distance - {distance} "
        )
        print("update: text sample - ", preview)

    return hits
| 62 | |
| 63 | |
| 64 | if __name__ == "__main__": |
no test coverage detected