added python rag news summary
Signed-off-by: Matt Williams <m@technovangelist.com>
This commit is contained in:
parent
08b0e04f40
commit
c5c8b4b16a
4 changed files with 225 additions and 0 deletions
22
examples/python-rag-newssummary/README.md
Normal file
22
examples/python-rag-newssummary/README.md
Normal file
|
@ -0,0 +1,22 @@
|
|||
# News Summarizer
|
||||
|
||||
This example goes through a series of steps:
|
||||
|
||||
1. You choose a topic area (e.g., "news", "NVidia", "music", etc.).
|
||||
2. Gets the most recent articles on that topic from various sources.
|
||||
3. Uses Ollama to summarize each article.
|
||||
4. Creates chunks of sentences from each article.
|
||||
5. Uses Sentence Transformers to generate embeddings for each of those chunks.
|
||||
6. You enter a question regarding the summaries shown.
|
||||
7. Uses Sentence Transformers to generate an embedding for that question.
|
||||
8. Uses the embedded question to find the most similar chunks.
|
||||
9. Feeds all that to Ollama to generate a good answer to your question based on these news articles.
|
||||
|
||||
This example lets you pick from a few different topic areas, then summarizes the most recent articles for that topic (five by default). It then splits each article into chunks of sentences and generates an embedding for each chunk.
|
||||
|
||||
With Ollama running locally and the `mistral-openorca` model available, you can run the example like this:
|
||||
|
||||
```bash
|
||||
python3 -m pip install -r requirements.txt
|
||||
python3 summ.py
|
||||
```
|
9
examples/python-rag-newssummary/requirements.txt
Normal file
9
examples/python-rag-newssummary/requirements.txt
Normal file
|
@ -0,0 +1,9 @@
|
|||
beautifulsoup4==4.12.2
|
||||
feedparser==6.0.10
|
||||
mattsollamatools==0.0.8
|
||||
newspaper3k==0.2.8
|
||||
nltk==3.8.1
|
||||
numpy==1.24.3
|
||||
Requests==2.31.0
|
||||
scikit_learn==1.3.0
|
||||
sentence_transformers==2.2.2
|
86
examples/python-rag-newssummary/summ.py
Normal file
86
examples/python-rag-newssummary/summ.py
Normal file
|
@ -0,0 +1,86 @@
|
|||
import curses
|
||||
import json
|
||||
from utils import get_url_for_topic, topic_urls, menu, getUrls, get_summary, getArticleText, knn_search
|
||||
import requests
|
||||
from sentence_transformers import SentenceTransformer
|
||||
from mattsollamatools import chunker
|
||||
|
||||
def _embed_article(url, model):
    """Summarize one article and embed each of its chunks.

    Args:
        url: Address of the article to download.
        model: Object with an ``encode`` method (a ``SentenceTransformer``)
            used to embed the chunks.

    Returns:
        A ``(summary, article)`` tuple. ``article`` is a dict with the keys
        ``'embeddings'`` (a list of ``{'source', 'embedding',
        'sourcelength'}`` dicts, one per chunk) and ``'url'``.
    """
    text = getArticleText(url)
    summary = get_summary(text)
    # Split the article text into chunks with mattsollamatools' chunker.
    chunks = chunker(text)
    vectors = model.encode(chunks)
    article = {
        'embeddings': [
            {
                'source': chunk,
                'embedding': vector.tolist(),  # Convert NumPy array to list
                'sourcelength': len(chunk),
            }
            for chunk, vector in zip(chunks, vectors)
        ],
        'url': url,
    }
    return summary, article


def _answer_question(question, model, allEmbeddings, context):
    """Answer one question from the stored chunks and print the reply.

    Args:
        question: The user's question text.
        model: Embedding model used to encode the question.
        allEmbeddings: List of article dicts as built by ``_embed_article``.
        context: Conversation context returned by the previous Ollama call
            (an empty list for the first question).

    Returns:
        The context to send with the next question: the one returned by
        Ollama on success, otherwise the unchanged ``context``.
    """
    # Embed the user's question
    question_embedding = model.encode([question])

    # Perform KNN search to find the best matches (indices and source text)
    best_matches = knn_search(question_embedding, allEmbeddings, k=10)

    # One match per line so consecutive snippets do not run into each other.
    sourcetext = "\n".join(
        f"{i}. Index: {index}, Source Text: {source_text}"
        for i, (index, source_text) in enumerate(best_matches, start=1)
    )
    systemPrompt = f"Only use the following information to answer the question. Do not use anything else: {sourcetext}"

    url = "http://localhost:11434/api/generate"
    payload = {
        "model": "mistral-openorca",
        "prompt": question,
        "system": systemPrompt,
        "stream": False,
        "context": context,
    }
    headers = {"Content-Type": "application/json"}

    # Send the POST request
    response = requests.post(url, data=json.dumps(payload), headers=headers)

    if response.status_code != 200:
        print(f"Request failed with status code {response.status_code}")
        return context

    output = json.loads(response.text)
    print(output['response'] + "\n")
    # Keep the previous context if the reply carries none.
    return output.get('context', context)


def main():
    """Pick a topic, print article summaries, then answer questions about them."""
    # menu() returns the feed URL of the selected topic, which is what
    # getUrls() expects.
    chosen_topic = curses.wrapper(menu)
    print("Here is your news summary:\n")
    urls = getUrls(chosen_topic, n=5)
    model = SentenceTransformer('all-MiniLM-L6-v2')
    allEmbeddings = []

    for url in urls:
        summary, article = _embed_article(url, model)
        allEmbeddings.append(article)
        print(f"{summary}\n")

    # Initialised once, outside the loop, so the context returned by each
    # answer is actually sent along with the next question.
    context = []
    while True:
        # Input a question from the user
        question = input("Enter your question about the news, or type quit: ")

        if question.strip().lower() == 'quit':
            break

        context = _answer_question(question, model, allEmbeddings, context)


if __name__ == "__main__":
    main()
|
||||
|
108
examples/python-rag-newssummary/utils.py
Normal file
108
examples/python-rag-newssummary/utils.py
Normal file
|
@ -0,0 +1,108 @@
|
|||
import curses
|
||||
import feedparser
|
||||
import requests
|
||||
import unicodedata
|
||||
import json
|
||||
from newspaper import Article
|
||||
from bs4 import BeautifulSoup
|
||||
from nltk.tokenize import sent_tokenize, word_tokenize
|
||||
import numpy as np
|
||||
from sklearn.neighbors import NearestNeighbors
|
||||
from mattsollamatools import chunker
|
||||
|
||||
# Create a dictionary to store topics and their URLs
|
||||
# Maps each topic name shown in the menu to the feed URL its articles are
# read from. Order here is the order the menu lists the topics in.
topic_urls = dict(
    (
        ("Mac", "https://9to5mac.com/guides/mac/feed"),
        ("News", "http://www.npr.org/rss/rss.php?id=1001"),
        ("Nvidia", "https://nvidianews.nvidia.com/releases.xml"),
        ("Raspberry Pi", "https://www.raspberrypi.com/news/feed/"),
        ("Music", "https://www.billboard.com/c/music/music-news/feed/"),
    )
)
|
||||
|
||||
# Use curses to create a menu of topics
|
||||
def menu(stdscr):
    """Run the topic menu and return the feed URL of the chosen topic.

    Intended to be passed to ``curses.wrapper``.

    Args:
        stdscr: The curses window to draw on.

    Returns:
        The feed URL returned by ``get_url_for_topic``.
    """
    # get_url_for_topic() hands back the feed URL, not the topic name, so
    # the name has to be recovered from topic_urls for the status line.
    feed_url = get_url_for_topic(stdscr)
    topic_name = next(
        (name for name, url in topic_urls.items() if url == feed_url),
        "unknown topic",
    )

    stdscr.addstr(len(topic_urls) + 3, 0, f"Selected URL for {topic_name}: {feed_url}")
    stdscr.refresh()

    return feed_url
|
||||
|
||||
# Show the topic menu and return the feed URL of the topic the user selects
|
||||
def get_url_for_topic(stdscr):
    """Let the user pick a topic with the arrow keys and return its feed URL.

    Draws one row per key of ``topic_urls``, marks the current row with
    ``>``, and loops until Enter is pressed.

    Args:
        stdscr: The curses window to draw on and read keys from.

    Returns:
        The ``topic_urls`` value (feed URL) for the highlighted topic.
    """
    try:
        curses.curs_set(0)  # Hide the cursor
    except curses.error:
        # Some terminals cannot hide the cursor; the menu works regardless.
        pass
    stdscr.clear()

    stdscr.addstr(0, 0, "Choose a topic using the arrow keys (Press Enter to select):")

    # Create a list of topics
    topics = list(topic_urls)
    current_topic = 0
    # Line feed, carriage return, and the keypad Enter key all confirm.
    enter_keys = (10, 13, curses.KEY_ENTER)

    while True:
        for i, topic in enumerate(topics):
            # Both prefixes have the same width, so redrawing a row fully
            # overwrites what was there and leaves no stray character.
            marker = ">" if i == current_topic else " "
            stdscr.addstr(i + 2, 2, f"{marker} {topic}")

        stdscr.refresh()

        key = stdscr.getch()

        if key == curses.KEY_DOWN and current_topic < len(topics) - 1:
            current_topic += 1
        elif key == curses.KEY_UP and current_topic > 0:
            current_topic -= 1
        elif key in enter_keys:
            return topic_urls[topics[current_topic]]
|
||||
|
||||
# Get the last N URLs from an RSS feed
|
||||
def getUrls(feed_url, n=20):
    """Return the links of the last ``n`` entries of a feed.

    Args:
        feed_url: Feed location handed to ``feedparser.parse``.
        n: Maximum number of links to return. Zero or a negative value
            yields an empty list.

    Returns:
        A list of entry links, in feed order. Entries without a link are
        skipped.
    """
    # Without this guard, entries[-0:] would return the whole feed and a
    # negative n would slice from the front instead of the back.
    if n <= 0:
        return []

    feed = feedparser.parse(feed_url)
    # NOTE(review): this takes the tail of the feed as delivered. Feeds that
    # list newest items first would yield their oldest entries here; confirm
    # the intended ordering.
    entries = feed.entries[-n:]
    links = (getattr(entry, "link", None) for entry in entries)
    return [link for link in links if link]
|
||||
|
||||
# News pages often carry a lot of ads and menus around the article. This uses newspaper3k to extract only the article's text.
|
||||
def getArticleText(url):
    """Download the page at ``url`` and return the article text newspaper3k extracts.

    Args:
        url: Address of the article page.

    Returns:
        The ``text`` attribute of the parsed ``Article``.
    """
    page = Article(url)
    # The page is parsed only after it has been downloaded.
    page.download()
    page.parse()
    body = page.text
    return body
|
||||
|
||||
def get_summary(text):
    """Ask the local Ollama server for a short summary of ``text``.

    Sends ``text`` as the prompt to the ``mistral-openorca`` model through
    the ``/api/generate`` endpoint on ``localhost:11434`` with streaming
    disabled.

    Args:
        text: The article text to summarize.

    Returns:
        The ``"response"`` field of the JSON reply.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    systemPrompt = "Write a concise summary of the text, return your responses with 5 lines that cover the key points of the text given."

    url = "http://localhost:11434/api/generate"
    payload = {
        "model": "mistral-openorca",
        "prompt": text,
        "system": systemPrompt,
        "stream": False,
    }

    # json= serializes the payload and sets the JSON Content-Type header.
    response = requests.post(url, json=payload)
    # Fail with the HTTP status rather than a KeyError or JSON decode error
    # further down when the server rejects the request.
    response.raise_for_status()

    return response.json()["response"]
|
||||
|
||||
# Perform K-nearest neighbors (KNN) search
|
||||
def knn_search(question_embedding, embeddings, k=5):
    """Find the stored chunks closest to a question embedding.

    Args:
        question_embedding: 2-D array-like of query vectors; only the
            neighbours of the first row are returned.
        embeddings: List of article dicts, each with an ``'embeddings'``
            list whose items carry ``'embedding'`` and ``'source'``.
        k: Maximum number of matches to return.

    Returns:
        A list of ``(index, source_text)`` tuples in the order returned by
        ``kneighbors``, where ``index`` is the position of the chunk in the
        flattened list of all chunks. There are at most ``k`` tuples, and
        fewer when fewer chunks are stored. The list is empty when there
        are no chunks or ``k`` is not positive.
    """
    items = [item for article in embeddings for item in article['embeddings']]

    # NearestNeighbors rejects n_neighbors larger than the number of fitted
    # samples, and an empty sample set, so clamp k and bail out early.
    k = min(k, len(items))
    if k <= 0:
        return []

    X = np.array([item['embedding'] for item in items])
    source_texts = [item['source'] for item in items]

    # Fit a KNN model on the embeddings
    knn = NearestNeighbors(n_neighbors=k, metric='cosine')
    knn.fit(X)

    # Find the indices of the k-nearest neighbors; distances are not used.
    _, indices = knn.kneighbors(question_embedding, n_neighbors=k)

    # Pair each neighbour index with its source text
    return [(idx, source_texts[idx]) for idx in indices[0]]
|
Loading…
Reference in a new issue