diff --git a/examples/langchain-python-rag-privategpt/ingest.py b/examples/langchain-python-rag-privategpt/ingest.py
index 35324775..0f71ccf0 100755
--- a/examples/langchain-python-rag-privategpt/ingest.py
+++ b/examples/langchain-python-rag-privategpt/ingest.py
@@ -77,13 +77,34 @@ LOADER_MAPPING = {
 
 
 def load_single_document(file_path: str) -> List[Document]:
-    ext = "." + file_path.rsplit(".", 1)[-1]
-    if ext in LOADER_MAPPING:
-        loader_class, loader_args = LOADER_MAPPING[ext]
-        loader = loader_class(file_path, **loader_args)
-        return loader.load()
+    """Load a single file using the loader registered for its extension.
+
+    Empty files, files whose extension has no entry in LOADER_MAPPING, and
+    files whose loader raises are reported on stdout and skipped.
+
+    Args:
+        file_path: Path of the file to load.
+
+    Returns:
+        The documents produced by the loader, or an empty list when the
+        file is skipped.
+    """
+    if os.path.getsize(file_path) == 0:
+        print(f"Empty file {file_path}. Ignoring it.")
+        return []
+    _, ext = os.path.splitext(file_path)
+    if ext not in LOADER_MAPPING:
+        print(f"Unsupported file {file_path}. Ignoring it.")
+        return []
+    loader_class, loader_args = LOADER_MAPPING[ext]
+    try:
+        return loader_class(file_path, **loader_args).load()
+    except Exception:
+        # Best effort: one unreadable file must not abort the whole ingest,
+        # but KeyboardInterrupt/SystemExit are no longer swallowed here.
+        print(f"Corrupted file {file_path}. Ignoring it.")
+        return []
 
-    raise ValueError(f"Unsupported file extension '{ext}'")
 
 def load_documents(source_dir: str, ignored_files: List[str] = []) -> List[Document]:
     """
@@ -100,7 +121,8 @@ def load_documents(source_dir: str, ignored_files: List[str] = []) -> List[Docum
         results = []
         with tqdm(total=len(filtered_files), desc='Loading new documents', ncols=80) as pbar:
             for i, docs in enumerate(pool.imap_unordered(load_single_document, filtered_files)):
-                results.extend(docs)
+                if docs:
+                    results.extend(docs)
                 pbar.update()
 
     return results