add privategpt
example
This commit is contained in:
parent
efae43f932
commit
b1c88eb978
9 changed files with 5778 additions and 0 deletions
170
examples/privategpt/.gitignore
vendored
Normal file
170
examples/privategpt/.gitignore
vendored
Normal file
|
@ -0,0 +1,170 @@
|
|||
# OSX
|
||||
.DS_Store
|
||||
|
||||
# Models
|
||||
models/
|
||||
|
||||
# Local Chroma db
|
||||
.chroma/
|
||||
db/
|
||||
|
||||
# Byte-compiled / optimized / DLL files
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
*$py.class
|
||||
|
||||
# C extensions
|
||||
*.so
|
||||
|
||||
# Distribution / packaging
|
||||
.Python
|
||||
build/
|
||||
develop-eggs/
|
||||
dist/
|
||||
downloads/
|
||||
eggs/
|
||||
.eggs/
|
||||
lib/
|
||||
lib64/
|
||||
parts/
|
||||
sdist/
|
||||
var/
|
||||
wheels/
|
||||
share/python-wheels/
|
||||
*.egg-info/
|
||||
.installed.cfg
|
||||
*.egg
|
||||
MANIFEST
|
||||
|
||||
# PyInstaller
|
||||
# Usually these files are written by a python script from a template
|
||||
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
||||
*.manifest
|
||||
*.spec
|
||||
|
||||
# Installer logs
|
||||
pip-log.txt
|
||||
pip-delete-this-directory.txt
|
||||
|
||||
# Unit test / coverage reports
|
||||
htmlcov/
|
||||
.tox/
|
||||
.nox/
|
||||
.coverage
|
||||
.coverage.*
|
||||
.cache
|
||||
nosetests.xml
|
||||
coverage.xml
|
||||
*.cover
|
||||
*.py,cover
|
||||
.hypothesis/
|
||||
.pytest_cache/
|
||||
cover/
|
||||
|
||||
# Translations
|
||||
*.mo
|
||||
*.pot
|
||||
|
||||
# Django stuff:
|
||||
*.log
|
||||
local_settings.py
|
||||
db.sqlite3
|
||||
db.sqlite3-journal
|
||||
|
||||
# Flask stuff:
|
||||
instance/
|
||||
.webassets-cache
|
||||
|
||||
# Scrapy stuff:
|
||||
.scrapy
|
||||
|
||||
# Sphinx documentation
|
||||
docs/_build/
|
||||
|
||||
# PyBuilder
|
||||
.pybuilder/
|
||||
target/
|
||||
|
||||
# Jupyter Notebook
|
||||
.ipynb_checkpoints
|
||||
|
||||
# IPython
|
||||
profile_default/
|
||||
ipython_config.py
|
||||
|
||||
# pyenv
|
||||
# For a library or package, you might want to ignore these files since the code is
|
||||
# intended to run in multiple environments; otherwise, check them in:
|
||||
# .python-version
|
||||
|
||||
# pipenv
|
||||
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
||||
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
||||
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
||||
# install all needed dependencies.
|
||||
#Pipfile.lock
|
||||
|
||||
# poetry
|
||||
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
||||
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
||||
# commonly ignored for libraries.
|
||||
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
||||
#poetry.lock
|
||||
|
||||
# pdm
|
||||
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
||||
#pdm.lock
|
||||
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
|
||||
# in version control.
|
||||
# https://pdm.fming.dev/#use-with-ide
|
||||
.pdm.toml
|
||||
|
||||
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
||||
__pypackages__/
|
||||
|
||||
# Celery stuff
|
||||
celerybeat-schedule
|
||||
celerybeat.pid
|
||||
|
||||
# SageMath parsed files
|
||||
*.sage.py
|
||||
|
||||
# Environments
|
||||
.env
|
||||
.venv
|
||||
env/
|
||||
venv/
|
||||
ENV/
|
||||
env.bak/
|
||||
venv.bak/
|
||||
|
||||
# Spyder project settings
|
||||
.spyderproject
|
||||
.spyproject
|
||||
|
||||
# Rope project settings
|
||||
.ropeproject
|
||||
|
||||
# mkdocs documentation
|
||||
/site
|
||||
|
||||
# mypy
|
||||
.mypy_cache/
|
||||
.dmypy.json
|
||||
dmypy.json
|
||||
|
||||
# Pyre type checker
|
||||
.pyre/
|
||||
|
||||
# pytype static type analyzer
|
||||
.pytype/
|
||||
|
||||
# Cython debug symbols
|
||||
cython_debug/
|
||||
|
||||
# PyCharm
|
||||
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
||||
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
||||
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
||||
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
||||
#.idea/
|
201
examples/privategpt/LICENSE
Normal file
201
examples/privategpt/LICENSE
Normal file
|
@ -0,0 +1,201 @@
|
|||
Apache License
|
||||
Version 2.0, January 2004
|
||||
http://www.apache.org/licenses/
|
||||
|
||||
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
||||
|
||||
1. Definitions.
|
||||
|
||||
"License" shall mean the terms and conditions for use, reproduction,
|
||||
and distribution as defined by Sections 1 through 9 of this document.
|
||||
|
||||
"Licensor" shall mean the copyright owner or entity authorized by
|
||||
the copyright owner that is granting the License.
|
||||
|
||||
"Legal Entity" shall mean the union of the acting entity and all
|
||||
other entities that control, are controlled by, or are under common
|
||||
control with that entity. For the purposes of this definition,
|
||||
"control" means (i) the power, direct or indirect, to cause the
|
||||
direction or management of such entity, whether by contract or
|
||||
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
||||
outstanding shares, or (iii) beneficial ownership of such entity.
|
||||
|
||||
"You" (or "Your") shall mean an individual or Legal Entity
|
||||
exercising permissions granted by this License.
|
||||
|
||||
"Source" form shall mean the preferred form for making modifications,
|
||||
including but not limited to software source code, documentation
|
||||
source, and configuration files.
|
||||
|
||||
"Object" form shall mean any form resulting from mechanical
|
||||
transformation or translation of a Source form, including but
|
||||
not limited to compiled object code, generated documentation,
|
||||
and conversions to other media types.
|
||||
|
||||
"Work" shall mean the work of authorship, whether in Source or
|
||||
Object form, made available under the License, as indicated by a
|
||||
copyright notice that is included in or attached to the work
|
||||
(an example is provided in the Appendix below).
|
||||
|
||||
"Derivative Works" shall mean any work, whether in Source or Object
|
||||
form, that is based on (or derived from) the Work and for which the
|
||||
editorial revisions, annotations, elaborations, or other modifications
|
||||
represent, as a whole, an original work of authorship. For the purposes
|
||||
of this License, Derivative Works shall not include works that remain
|
||||
separable from, or merely link (or bind by name) to the interfaces of,
|
||||
the Work and Derivative Works thereof.
|
||||
|
||||
"Contribution" shall mean any work of authorship, including
|
||||
the original version of the Work and any modifications or additions
|
||||
to that Work or Derivative Works thereof, that is intentionally
|
||||
submitted to Licensor for inclusion in the Work by the copyright owner
|
||||
or by an individual or Legal Entity authorized to submit on behalf of
|
||||
the copyright owner. For the purposes of this definition, "submitted"
|
||||
means any form of electronic, verbal, or written communication sent
|
||||
to the Licensor or its representatives, including but not limited to
|
||||
communication on electronic mailing lists, source code control systems,
|
||||
and issue tracking systems that are managed by, or on behalf of, the
|
||||
Licensor for the purpose of discussing and improving the Work, but
|
||||
excluding communication that is conspicuously marked or otherwise
|
||||
designated in writing by the copyright owner as "Not a Contribution."
|
||||
|
||||
"Contributor" shall mean Licensor and any individual or Legal Entity
|
||||
on behalf of whom a Contribution has been received by Licensor and
|
||||
subsequently incorporated within the Work.
|
||||
|
||||
2. Grant of Copyright License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
copyright license to reproduce, prepare Derivative Works of,
|
||||
publicly display, publicly perform, sublicense, and distribute the
|
||||
Work and such Derivative Works in Source or Object form.
|
||||
|
||||
3. Grant of Patent License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
(except as stated in this section) patent license to make, have made,
|
||||
use, offer to sell, sell, import, and otherwise transfer the Work,
|
||||
where such license applies only to those patent claims licensable
|
||||
by such Contributor that are necessarily infringed by their
|
||||
Contribution(s) alone or by combination of their Contribution(s)
|
||||
with the Work to which such Contribution(s) was submitted. If You
|
||||
institute patent litigation against any entity (including a
|
||||
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
||||
or a Contribution incorporated within the Work constitutes direct
|
||||
or contributory patent infringement, then any patent licenses
|
||||
granted to You under this License for that Work shall terminate
|
||||
as of the date such litigation is filed.
|
||||
|
||||
4. Redistribution. You may reproduce and distribute copies of the
|
||||
Work or Derivative Works thereof in any medium, with or without
|
||||
modifications, and in Source or Object form, provided that You
|
||||
meet the following conditions:
|
||||
|
||||
(a) You must give any other recipients of the Work or
|
||||
Derivative Works a copy of this License; and
|
||||
|
||||
(b) You must cause any modified files to carry prominent notices
|
||||
stating that You changed the files; and
|
||||
|
||||
(c) You must retain, in the Source form of any Derivative Works
|
||||
that You distribute, all copyright, patent, trademark, and
|
||||
attribution notices from the Source form of the Work,
|
||||
excluding those notices that do not pertain to any part of
|
||||
the Derivative Works; and
|
||||
|
||||
(d) If the Work includes a "NOTICE" text file as part of its
|
||||
distribution, then any Derivative Works that You distribute must
|
||||
include a readable copy of the attribution notices contained
|
||||
within such NOTICE file, excluding those notices that do not
|
||||
pertain to any part of the Derivative Works, in at least one
|
||||
of the following places: within a NOTICE text file distributed
|
||||
as part of the Derivative Works; within the Source form or
|
||||
documentation, if provided along with the Derivative Works; or,
|
||||
within a display generated by the Derivative Works, if and
|
||||
wherever such third-party notices normally appear. The contents
|
||||
of the NOTICE file are for informational purposes only and
|
||||
do not modify the License. You may add Your own attribution
|
||||
notices within Derivative Works that You distribute, alongside
|
||||
or as an addendum to the NOTICE text from the Work, provided
|
||||
that such additional attribution notices cannot be construed
|
||||
as modifying the License.
|
||||
|
||||
You may add Your own copyright statement to Your modifications and
|
||||
may provide additional or different license terms and conditions
|
||||
for use, reproduction, or distribution of Your modifications, or
|
||||
for any such Derivative Works as a whole, provided Your use,
|
||||
reproduction, and distribution of the Work otherwise complies with
|
||||
the conditions stated in this License.
|
||||
|
||||
5. Submission of Contributions. Unless You explicitly state otherwise,
|
||||
any Contribution intentionally submitted for inclusion in the Work
|
||||
by You to the Licensor shall be under the terms and conditions of
|
||||
this License, without any additional terms or conditions.
|
||||
Notwithstanding the above, nothing herein shall supersede or modify
|
||||
the terms of any separate license agreement you may have executed
|
||||
with Licensor regarding such Contributions.
|
||||
|
||||
6. Trademarks. This License does not grant permission to use the trade
|
||||
names, trademarks, service marks, or product names of the Licensor,
|
||||
except as required for reasonable and customary use in describing the
|
||||
origin of the Work and reproducing the content of the NOTICE file.
|
||||
|
||||
7. Disclaimer of Warranty. Unless required by applicable law or
|
||||
agreed to in writing, Licensor provides the Work (and each
|
||||
Contributor provides its Contributions) on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||
implied, including, without limitation, any warranties or conditions
|
||||
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
||||
PARTICULAR PURPOSE. You are solely responsible for determining the
|
||||
appropriateness of using or redistributing the Work and assume any
|
||||
risks associated with Your exercise of permissions under this License.
|
||||
|
||||
8. Limitation of Liability. In no event and under no legal theory,
|
||||
whether in tort (including negligence), contract, or otherwise,
|
||||
unless required by applicable law (such as deliberate and grossly
|
||||
negligent acts) or agreed to in writing, shall any Contributor be
|
||||
liable to You for damages, including any direct, indirect, special,
|
||||
incidental, or consequential damages of any character arising as a
|
||||
result of this License or out of the use or inability to use the
|
||||
Work (including but not limited to damages for loss of goodwill,
|
||||
work stoppage, computer failure or malfunction, or any and all
|
||||
other commercial damages or losses), even if such Contributor
|
||||
has been advised of the possibility of such damages.
|
||||
|
||||
9. Accepting Warranty or Additional Liability. While redistributing
|
||||
the Work or Derivative Works thereof, You may choose to offer,
|
||||
and charge a fee for, acceptance of support, warranty, indemnity,
|
||||
or other liability obligations and/or rights consistent with this
|
||||
License. However, in accepting such obligations, You may act only
|
||||
on Your own behalf and on Your sole responsibility, not on behalf
|
||||
of any other Contributor, and only if You agree to indemnify,
|
||||
defend, and hold each Contributor harmless for any liability
|
||||
incurred by, or claims asserted against, such Contributor by reason
|
||||
of your accepting any such warranty or additional liability.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
APPENDIX: How to apply the Apache License to your work.
|
||||
|
||||
To apply the Apache License to your work, attach the following
|
||||
boilerplate notice, with the fields enclosed by brackets "[]"
|
||||
replaced with your own identifying information. (Don't include
|
||||
the brackets!) The text should be enclosed in the appropriate
|
||||
comment syntax for the file format. We also recommend that a
|
||||
file or class name and description of purpose be included on the
|
||||
same "printed page" as the copyright notice for easier
|
||||
identification within third-party archives.
|
||||
|
||||
Copyright [yyyy] [name of copyright owner]
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
66
examples/privategpt/README.md
Normal file
66
examples/privategpt/README.md
Normal file
|
@ -0,0 +1,66 @@
|
|||
# privateGPT with Llama 2 Uncensored
|
||||
|
||||
> Note: this example is a simplified version of [PrivateGPT](https://github.com/imartinez/privateGPT) that works with Llama 2 Uncensored.
|
||||
|
||||
### Setup
|
||||
|
||||
```shell
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
### Getting WeWork's latest quarterly report
|
||||
|
||||
```
|
||||
mkdir -p source_documents && curl https://d18rn0p25nwr6d.cloudfront.net/CIK-0001813756/975b3e9b-268e-4798-a9e4-2a9a7c92dc10.pdf -o source_documents/wework.pdf
|
||||
```
|
||||
|
||||
### Ingesting data
|
||||
|
||||
```shell
|
||||
python ingest.py
|
||||
```
|
||||
|
||||
Output should look like this:
|
||||
|
||||
```shell
|
||||
Creating new vectorstore
|
||||
Loading documents from source_documents
|
||||
Loading new documents: 100%|██████████████████████| 1/1 [00:01<00:00, 1.73s/it]
|
||||
Loaded 1 new documents from source_documents
|
||||
Split into 90 chunks of text (max. 500 tokens each)
|
||||
Creating embeddings. May take some minutes...
|
||||
Using embedded DuckDB with persistence: data will be stored in: db
|
||||
Ingestion complete! You can now run privateGPT.py to query your documents
|
||||
```
|
||||
|
||||
### Ask Questions!
|
||||
|
||||
```shell
|
||||
python privateGPT.py
|
||||
|
||||
Enter a query: How many locations does WeWork have?
|
||||
|
||||
> Answer (took 17.7 s.):
|
||||
As of June 2023, WeWork has 777 locations worldwide, including 610 Consolidated Locations (as defined in the section entitled Key Performance Indicators).
|
||||
```
|
||||
|
||||
## Adding your own data
|
||||
|
||||
Put all of your files into the `source_documents` directory.
|
||||
|
||||
The supported extensions are:
|
||||
|
||||
- `.csv`: CSV,
|
||||
- `.docx`: Word Document,
|
||||
- `.doc`: Word Document,
|
||||
- `.enex`: EverNote,
|
||||
- `.eml`: Email,
|
||||
- `.epub`: EPub,
|
||||
- `.html`: HTML File,
|
||||
- `.md`: Markdown,
|
||||
- `.msg`: Outlook Message,
|
||||
- `.odt`: Open Document Text,
|
||||
- `.pdf`: Portable Document Format (PDF),
|
||||
- `.pptx` : PowerPoint Document,
|
||||
- `.ppt` : PowerPoint Document,
|
||||
- `.txt`: Text file (UTF-8),
|
15
examples/privategpt/constants.py
Normal file
15
examples/privategpt/constants.py
Normal file
|
@ -0,0 +1,15 @@
|
|||
import os
|
||||
from dotenv import load_dotenv
|
||||
from chromadb.config import Settings
|
||||
|
||||
load_dotenv()
|
||||
|
||||
# Directory in which the Chroma database is persisted; override with the
# PERSIST_DIRECTORY environment variable.
PERSIST_DIRECTORY = os.getenv('PERSIST_DIRECTORY', 'db')

# Client settings shared by the scripts that import this module.
CHROMA_SETTINGS = Settings(
    persist_directory=PERSIST_DIRECTORY,
    chroma_db_impl='duckdb+parquet',
    anonymized_telemetry=False,
)
|
166
examples/privategpt/ingest.py
Executable file
166
examples/privategpt/ingest.py
Executable file
|
@ -0,0 +1,166 @@
|
|||
#!/usr/bin/env python3
|
||||
import os
|
||||
import glob
|
||||
from typing import List
|
||||
from dotenv import load_dotenv
|
||||
from multiprocessing import Pool
|
||||
from tqdm import tqdm
|
||||
|
||||
from langchain.document_loaders import (
|
||||
CSVLoader,
|
||||
EverNoteLoader,
|
||||
PyMuPDFLoader,
|
||||
TextLoader,
|
||||
UnstructuredEmailLoader,
|
||||
UnstructuredEPubLoader,
|
||||
UnstructuredHTMLLoader,
|
||||
UnstructuredMarkdownLoader,
|
||||
UnstructuredODTLoader,
|
||||
UnstructuredPowerPointLoader,
|
||||
UnstructuredWordDocumentLoader,
|
||||
)
|
||||
|
||||
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
||||
from langchain.vectorstores import Chroma
|
||||
from langchain.embeddings import HuggingFaceEmbeddings
|
||||
from langchain.docstore.document import Document
|
||||
from constants import CHROMA_SETTINGS
|
||||
|
||||
|
||||
load_dotenv()
|
||||
|
||||
|
||||
# Runtime configuration. Each value falls back to a default when the
# corresponding environment variable is not set.
persist_directory = os.getenv('PERSIST_DIRECTORY', 'db')
source_directory = os.getenv('SOURCE_DIRECTORY', 'source_documents')
embeddings_model_name = os.getenv('EMBEDDINGS_MODEL_NAME', 'all-MiniLM-L6-v2')

# Parameters handed to the text splitter in process_documents().
chunk_size = 500
chunk_overlap = 50
|
||||
|
||||
|
||||
# Custom document loaders
|
||||
class MyElmLoader(UnstructuredEmailLoader):
    """Email (.eml) loader that retries with the text/plain part when needed."""

    def load(self) -> List[Document]:
        """Load the email, falling back to plain text if no HTML part is found.

        Any exception that escapes is re-raised as the same exception type
        with the file path prepended to its message.
        """
        try:
            try:
                loaded = UnstructuredEmailLoader.load(self)
            except ValueError as err:
                # Only the "no HTML part" failure is recoverable here.
                if 'text/html content not found in email' not in str(err):
                    raise
                # Switch the content source and try once more.
                self.unstructured_kwargs["content_source"] = "text/plain"
                loaded = UnstructuredEmailLoader.load(self)
        except Exception as err:
            # Prefix the message with the offending file's path.
            raise type(err)(f"{self.file_path}: {err}") from err

        return loaded
|
||||
|
||||
|
||||
# Registry of supported file types: each extension maps to a
# (loader class, keyword arguments for that loader) pair.
# Extend this table to support further file types.
LOADER_MAPPING = {
    ".csv": (CSVLoader, {}),
    ".doc": (UnstructuredWordDocumentLoader, {}),
    ".docx": (UnstructuredWordDocumentLoader, {}),
    ".enex": (EverNoteLoader, {}),
    ".eml": (MyElmLoader, {}),
    ".epub": (UnstructuredEPubLoader, {}),
    ".html": (UnstructuredHTMLLoader, {}),
    ".md": (UnstructuredMarkdownLoader, {}),
    ".odt": (UnstructuredODTLoader, {}),
    ".pdf": (PyMuPDFLoader, {}),
    ".ppt": (UnstructuredPowerPointLoader, {}),
    ".pptx": (UnstructuredPowerPointLoader, {}),
    ".txt": (TextLoader, {"encoding": "utf8"}),
}
|
||||
|
||||
|
||||
def load_single_document(file_path: str) -> List[Document]:
    """Load one file with the loader registered for its extension.

    The extension is taken from the final path component only, so a dot in
    a directory name is not mistaken for one, and it is matched
    case-insensitively so that ``REPORT.PDF`` is handled like ``report.pdf``.

    Args:
        file_path: Path of the file to load.

    Returns:
        The documents produced by the selected loader.

    Raises:
        ValueError: If no loader is registered for the file's extension.
    """
    ext = os.path.splitext(file_path)[1]
    try:
        loader_class, loader_args = LOADER_MAPPING[ext.lower()]
    except KeyError:
        raise ValueError(f"Unsupported file extension '{ext}'") from None

    return loader_class(file_path, **loader_args).load()
|
||||
|
||||
def load_documents(source_dir: str, ignored_files: List[str] | None = None) -> List[Document]:
    """Load every supported document found under ``source_dir``.

    The directory is searched recursively for each extension registered in
    ``LOADER_MAPPING``. Files are loaded in a process pool, so the order of
    the returned documents is not defined.

    Args:
        source_dir: Root directory to search.
        ignored_files: Paths to skip. ``None`` or empty means nothing is skipped.

    Returns:
        The documents of all loaded files, concatenated.
    """
    # A set gives constant-time membership tests when many files are skipped.
    ignored = set(ignored_files or ())

    # Escape the directory so glob metacharacters in its name are literal.
    search_root = glob.escape(source_dir)
    candidate_files = []
    for ext in LOADER_MAPPING:
        candidate_files.extend(
            glob.glob(os.path.join(search_root, f"**/*{ext}"), recursive=True)
        )
    filtered_files = [path for path in candidate_files if path not in ignored]

    results = []
    with Pool(processes=os.cpu_count()) as pool:
        with tqdm(total=len(filtered_files), desc='Loading new documents', ncols=80) as pbar:
            # imap_unordered yields each file's documents as soon as a worker finishes.
            for docs in pool.imap_unordered(load_single_document, filtered_files):
                results.extend(docs)
                pbar.update()

    return results
|
||||
|
||||
def process_documents(ignored_files: List[str] | None = None) -> List[Document]:
    """Load the documents from ``source_directory`` and split them into chunks.

    Args:
        ignored_files: Paths to skip while loading. ``None`` or empty means
            nothing is skipped.

    Returns:
        The chunks produced by the text splitter.

    Raises:
        SystemExit: With status 0 when there is nothing new to load.
    """
    print(f"Loading documents from {source_directory}")
    documents = load_documents(source_directory, ignored_files or [])
    if not documents:
        print("No new documents to load")
        # Raise SystemExit directly: the exit() helper is injected by the
        # site module and is not guaranteed to be available.
        raise SystemExit(0)
    print(f"Loaded {len(documents)} new documents from {source_directory}")

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    texts = text_splitter.split_documents(documents)
    # NOTE(review): this splitter presumably measures chunk_size in
    # characters rather than tokens, despite the message wording - confirm.
    print(f"Split into {len(texts)} chunks of text (max. {chunk_size} tokens each)")
    return texts
|
||||
|
||||
def does_vectorstore_exist(persist_directory: str) -> bool:
    """Report whether ``persist_directory`` holds a usable vectorstore.

    The store counts as present when all of the following hold:

    * an ``index`` entry exists inside the directory,
    * both ``chroma-collections.parquet`` and ``chroma-embeddings.parquet``
      exist, and
    * the index contains more than three ``.bin`` / ``.pkl`` files.

    Args:
        persist_directory: Directory in which the vectorstore is persisted.

    Returns:
        ``True`` if every check passes, otherwise ``False``.
    """
    index_dir = os.path.join(persist_directory, 'index')
    if not os.path.exists(index_dir):
        return False

    parquet_names = ('chroma-collections.parquet', 'chroma-embeddings.parquet')
    if not all(os.path.exists(os.path.join(persist_directory, name)) for name in parquet_names):
        return False

    # Escape the directory part so glob metacharacters in the path
    # (e.g. "db[1]") are matched literally instead of as a pattern.
    escaped_index_dir = glob.escape(index_dir)
    index_files = glob.glob(os.path.join(escaped_index_dir, '*.bin'))
    index_files += glob.glob(os.path.join(escaped_index_dir, '*.pkl'))

    # More than three index files are required for the store to count.
    return len(index_files) > 3
|
||||
|
||||
def main():
    """Ingest the source documents into the persisted Chroma vectorstore.

    A new store is created when none exists yet; otherwise only documents
    whose source is not already recorded in the store are added.
    """
    embeddings = HuggingFaceEmbeddings(model_name=embeddings_model_name)

    if not does_vectorstore_exist(persist_directory):
        # No usable store yet: build one from everything that is found.
        print("Creating new vectorstore")
        texts = process_documents()
        print("Creating embeddings. May take some minutes...")
        db = Chroma.from_documents(
            texts,
            embeddings,
            persist_directory=persist_directory,
            client_settings=CHROMA_SETTINGS,
        )
    else:
        # Store exists: skip sources it already contains and add the rest.
        print(f"Appending to existing vectorstore at {persist_directory}")
        db = Chroma(
            persist_directory=persist_directory,
            embedding_function=embeddings,
            client_settings=CHROMA_SETTINGS,
        )
        collection = db.get()
        known_sources = [metadata['source'] for metadata in collection['metadatas']]
        texts = process_documents(known_sources)
        print("Creating embeddings. May take some minutes...")
        db.add_documents(texts)

    db.persist()
    db = None  # drop the reference to the store once it has been persisted

    print("Ingestion complete! You can now run privateGPT.py to query your documents")


if __name__ == "__main__":
    main()
|
3310
examples/privategpt/poetry.lock
generated
Normal file
3310
examples/privategpt/poetry.lock
generated
Normal file
File diff suppressed because it is too large
Load diff
72
examples/privategpt/privateGPT.py
Executable file
72
examples/privategpt/privateGPT.py
Executable file
|
@ -0,0 +1,72 @@
|
|||
#!/usr/bin/env python3
|
||||
from dotenv import load_dotenv
|
||||
from langchain.chains import RetrievalQA
|
||||
from langchain.embeddings import HuggingFaceEmbeddings
|
||||
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
|
||||
from langchain.vectorstores import Chroma
|
||||
from langchain.llms import GPT4All, Ollama
|
||||
import os
|
||||
import argparse
|
||||
import time
|
||||
|
||||
load_dotenv()
|
||||
|
||||
# Runtime configuration; every value can be overridden through the environment.
model = os.getenv("MODEL", "llama2-uncensored")
embeddings_model_name = os.getenv("EMBEDDINGS_MODEL_NAME", "all-MiniLM-L6-v2")
persist_directory = os.getenv("PERSIST_DIRECTORY", "db")
# Passed to the retriever as its "k" search argument in main().
target_source_chunks = int(os.getenv('TARGET_SOURCE_CHUNKS', 4))
|
||||
|
||||
from constants import CHROMA_SETTINGS
|
||||
|
||||
def main():
    """Run the interactive question/answer loop over the ingested documents.

    Builds a retrieval QA chain from the persisted Chroma store and the
    configured Ollama model, then answers queries read from stdin until the
    user types ``exit`` or stdin is closed.
    """
    # Parse the command line arguments
    args = parse_arguments()
    embeddings = HuggingFaceEmbeddings(model_name=embeddings_model_name)
    db = Chroma(persist_directory=persist_directory, embedding_function=embeddings, client_settings=CHROMA_SETTINGS)
    retriever = db.as_retriever(search_kwargs={"k": target_source_chunks})
    # Activate/deactivate the streaming StdOut callback for LLMs
    callbacks = [] if args.mute_stream else [StreamingStdOutCallbackHandler()]

    llm = Ollama(model=model, callbacks=callbacks)

    qa = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        return_source_documents=not args.hide_source,
    )

    # Interactive questions and answers
    while True:
        try:
            query = input("\nEnter a query: ")
        except EOFError:
            # stdin was closed (Ctrl-D, or piped input ran out): stop
            # cleanly instead of terminating with a traceback.
            break

        # Surrounding whitespace is ignored for both the exit command and
        # the blank-line check; the query itself is passed on unchanged.
        command = query.strip()
        if command == "exit":
            break
        if not command:
            continue

        # Get the answer from the chain
        start = time.time()
        res = qa(query)
        answer = res['result']
        docs = [] if args.hide_source else res['source_documents']
        end = time.time()

        # Print the result
        print("\n\n> Question:")
        print(query)
        print(f"\n> Answer (took {round(end - start, 2)} s.):")
        print(answer)

        # Print the relevant sources used for the answer
        for document in docs:
            print("\n> " + document.metadata["source"] + ":")
            print(document.page_content)
|
||||
|
||||
def parse_arguments():
    """Parse the command line and return the resulting namespace.

    Two boolean switches are recognised: ``--hide-source`` / ``-S`` and
    ``--mute-stream`` / ``-M``. Both default to ``False``.
    """
    cli = argparse.ArgumentParser(
        description='privateGPT: Ask questions to your documents without an internet connection, '
                    'using the power of LLMs.',
    )

    # (long option, short option, help text) for each on/off switch.
    switches = (
        ("--hide-source", "-S",
         'Use this flag to disable printing of source documents used for answers.'),
        ("--mute-stream", "-M",
         'Use this flag to disable the streaming StdOut callback for LLMs.'),
    )
    for long_opt, short_opt, help_text in switches:
        cli.add_argument(long_opt, short_opt, action='store_true', help=help_text)

    return cli.parse_args()


if __name__ == "__main__":
    main()
|
25
examples/privategpt/pyproject.toml
Normal file
25
examples/privategpt/pyproject.toml
Normal file
|
@ -0,0 +1,25 @@
|
|||
[tool.poetry]
|
||||
name = "privategpt"
|
||||
version = "0.1.0"
|
||||
description = ""
|
||||
authors = ["Ivan Martinez <ivanmartit@gmail.com>"]
|
||||
license = "Apache-2.0"
|
||||
readme = "README.md"
|
||||
|
||||
[tool.poetry.dependencies]
|
||||
python = "^3.10"
|
||||
langchain = "0.0.261"
|
||||
gpt4all = "^1.0.3"
|
||||
chromadb = "^0.3.26"
|
||||
PyMuPDF = "^1.22.5"
|
||||
python-dotenv = "^1.0.0"
|
||||
unstructured = "^0.8.0"
|
||||
extract-msg = "^0.41.5"
|
||||
tabulate = "^0.9.0"
|
||||
pandoc = "^2.3"
|
||||
pypandoc = "^1.11"
|
||||
tqdm = "^4.65.0"
|
||||
|
||||
[build-system]
|
||||
requires = ["poetry-core"]
|
||||
build-backend = "poetry.core.masonry.api"
|
1753
examples/privategpt/requirements.txt
Normal file
1753
examples/privategpt/requirements.txt
Normal file
File diff suppressed because it is too large
Load diff
Loading…
Reference in a new issue