Add error handling to load_single_document() in ingest.py (#4852)
load_single_document() now handles: corrupt files, empty (zero-byte) files, and unsupported file extensions.
This commit is contained in:
parent
943172cbf4
commit
5528dd9d11
1 changed files with 16 additions and 7 deletions
|
@ -77,13 +77,21 @@ LOADER_MAPPING = {
|
||||||
|
|
||||||
|
|
||||||
def load_single_document(file_path: str) -> List[Document]:
|
def load_single_document(file_path: str) -> List[Document]:
|
||||||
ext = "." + file_path.rsplit(".", 1)[-1]
|
if os.path.getsize(file_path) != 0:
|
||||||
|
filename, ext = os.path.splitext(file_path)
|
||||||
if ext in LOADER_MAPPING:
|
if ext in LOADER_MAPPING:
|
||||||
loader_class, loader_args = LOADER_MAPPING[ext]
|
loader_class, loader_args = LOADER_MAPPING[ext]
|
||||||
|
try:
|
||||||
loader = loader_class(file_path, **loader_args)
|
loader = loader_class(file_path, **loader_args)
|
||||||
|
if loader:
|
||||||
return loader.load()
|
return loader.load()
|
||||||
|
except:
|
||||||
|
print(f"Corrupted file {file_path}. Ignoring it.")
|
||||||
|
else:
|
||||||
|
print(f"Unsupported file {file_path}. Ignoring it.")
|
||||||
|
else:
|
||||||
|
print(f"Empty file {file_path}. Ignoring it.")
|
||||||
|
|
||||||
raise ValueError(f"Unsupported file extension '{ext}'")
|
|
||||||
|
|
||||||
def load_documents(source_dir: str, ignored_files: List[str] = []) -> List[Document]:
|
def load_documents(source_dir: str, ignored_files: List[str] = []) -> List[Document]:
|
||||||
"""
|
"""
|
||||||
|
@ -100,6 +108,7 @@ def load_documents(source_dir: str, ignored_files: List[str] = []) -> List[Docum
|
||||||
results = []
|
results = []
|
||||||
with tqdm(total=len(filtered_files), desc='Loading new documents', ncols=80) as pbar:
|
with tqdm(total=len(filtered_files), desc='Loading new documents', ncols=80) as pbar:
|
||||||
for i, docs in enumerate(pool.imap_unordered(load_single_document, filtered_files)):
|
for i, docs in enumerate(pool.imap_unordered(load_single_document, filtered_files)):
|
||||||
|
if docs:
|
||||||
results.extend(docs)
|
results.extend(docs)
|
||||||
pbar.update()
|
pbar.update()
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue