Generalized source packager. Assumes minimal metadata (doc_name, page_num, and text chunk). Adds to the existing 'state' source and creates a new batch on top on overflow.
(self, retrieval_material, aggregate_source=True, add_to_prompt=True,
backup_source_filename="user_provided_unknown_source")
| 1988 | return toks |
| 1989 | |
| 1990 | def package_source(self, retrieval_material, aggregate_source=True, add_to_prompt=True, |
| 1991 | backup_source_filename="user_provided_unknown_source"): |
| 1992 | |
| 1993 | """ Generalized source packager |
| 1994 | --assumes minimal metadata - doc_name, page_num and text chunk |
| 1995 | --add to existing 'state' source & create new batch on top if overflow """ |
| 1996 | |
| 1997 | # tracking variables |
| 1998 | tokens_per_batch = [] |
| 1999 | samples_per_batch = [] |
| 2000 | sample_counter = 0 |
| 2001 | doc_sources = {} |
| 2002 | |
| 2003 | doc_sources_per_batch = {} |
| 2004 | |
| 2005 | biblio_per_batch = [] |
| 2006 | batches = [] |
| 2007 | meta = [] |
| 2008 | |
| 2009 | samples = [] |
| 2010 | |
| 2011 | for i, q in enumerate(retrieval_material): |
| 2012 | |
| 2013 | # simple deduplication check to remove identical entries - more 'cleaning' options can be offered over time |
| 2014 | if q not in samples: |
| 2015 | samples.append(q) |
| 2016 | |
| 2017 | # default |
| 2018 | current_batch = "" |
| 2019 | token_counter = 0 |
| 2020 | batch_metadata = [] |
| 2021 | batch_id = 0 |
| 2022 | char_counter = 0 |
| 2023 | |
| 2024 | if aggregate_source: |
| 2025 | # start current batch with the last entry in source materials and aggregate from this point |
| 2026 | if len(self.source_client.source_materials) > 0: |
| 2027 | |
| 2028 | # pull up the last 'in-progress' entry in current source materials state |
| 2029 | current_batch = self.source_client.source_materials[-1]["text"] |
| 2030 | token_counter = self.token_counter(current_batch) |
| 2031 | char_counter = len(current_batch) |
| 2032 | batch_metadata = self.source_client.source_materials[-1]["metadata"] |
| 2033 | batch_stats = self.source_client.source_materials[-1]["batch_stats"] |
| 2034 | batch_id = len(self.source_client.source_materials) - 1 |
| 2035 | |
| 2036 | # experiment |
| 2037 | doc_sources_per_batch = self.source_client.source_materials[-1]["biblio"] |
| 2038 | |
| 2039 | # end - experiment |
| 2040 | |
| 2041 | # 'pop' the last entry 'in-progress' off the list |
| 2042 | self.source_client.source_materials = self.source_client.source_materials[:-1] |
| 2043 | |
| 2044 | samples_chunked = [] |
| 2045 | |
| 2046 | for x in range(0,len(samples)): |
| 2047 |
no test coverage detected