diff --git a/ollama/cmd/cli.py b/ollama/cmd/cli.py
index f2b53f5d..a7f0f6c1 100644
--- a/ollama/cmd/cli.py
+++ b/ollama/cmd/cli.py
@@ -111,7 +111,7 @@ def generate_oneshot(*args, **kwargs):
     spinner.start()
     spinner_running = True
     try:
-        for output in engine.generate(*args, **kwargs):
+        for output in engine.generate(model_name=kwargs.pop('model'), *args, **kwargs):
             choices = output.get("choices", [])
             if len(choices) > 0:
                 if spinner_running:
@@ -147,7 +147,7 @@ def generate_batch(*args, **kwargs):


 def pull(*args, **kwargs):
-    model.pull(*args, **kwargs)
+    model.pull(model_name=kwargs.pop('model'), *args, **kwargs)


 def run(*args, **kwargs):
diff --git a/ollama/cmd/server.py b/ollama/cmd/server.py
index fe803c17..d634babe 100644
--- a/ollama/cmd/server.py
+++ b/ollama/cmd/server.py
@@ -38,7 +38,7 @@ def serve(*args, **kwargs):

     app.update(
         {
-            "llms": {},
+            "models": {},
         }
     )

@@ -47,32 +47,32 @@
 async def load(request):
     body = await request.json()

-    model = body.get("model")
-    if not model:
+    name = body.get("model")
+    if not name:
         raise web.HTTPBadRequest()

     kwargs = {
-        "llms": request.app.get("llms"),
+        "models": request.app.get("models"),
     }

-    engine.load(model, **kwargs)
+    engine.load(name, **kwargs)
     return web.Response()

 async def unload(request):
     body = await request.json()

-    model = body.get("model")
-    if not model:
+    name = body.get("model")
+    if not name:
         raise web.HTTPBadRequest()

-    engine.unload(model, llms=request.app.get("llms"))
+    engine.unload(name, models=request.app.get("models"))
     return web.Response()

 async def generate(request):
     body = await request.json()

-    model = body.get("model")
-    if not model:
+    name = body.get("model")
+    if not name:
         raise web.HTTPBadRequest()

     prompt = body.get("prompt")
@@ -83,10 +83,10 @@ async def generate(request):
     await response.prepare(request)

     kwargs = {
-        "llms": request.app.get("llms"),
+        "models": request.app.get("models"),
     }

-    for output in engine.generate(model, prompt, **kwargs):
+    for output in engine.generate(name, prompt, **kwargs):
         output = json.dumps(output).encode('utf-8')
         await response.write(output)
         await response.write(b"\n")
diff --git a/ollama/engine.py b/ollama/engine.py
index 67c5cce9..aa82336f 100644
--- a/ollama/engine.py
+++ b/ollama/engine.py
@@ -4,8 +4,8 @@ from os import path
 from contextlib import contextmanager
 from llama_cpp import Llama as LLM

-import ollama.model
 import ollama.prompt
+from ollama.model import models_home


 @contextmanager
@@ -18,10 +18,7 @@ def suppress_stderr():
         os.dup2(stderr, sys.stderr.fileno())


-def generate(model, prompt, llms={}, *args, **kwargs):
-    llm = load(model, llms=llms)
-
-    prompt = ollama.prompt.template(model, prompt)
+def generate(model_name, prompt, models={}, *args, **kwargs):
     if "max_tokens" not in kwargs:
         kwargs.update({"max_tokens": 16384})

@@ -31,34 +28,32 @@ def generate(model, prompt, llms={}, *args, **kwargs):
     if "stream" not in kwargs:
         kwargs.update({"stream": True})

-    for output in llm(prompt, *args, **kwargs):
+    prompt = ollama.prompt.template(model_name, prompt)
+
+    model = load(model_name, models=models)
+    for output in model.create_completion(prompt, *args, **kwargs):
         yield output


-def load(model, llms={}):
-    llm = llms.get(model, None)
-    if not llm:
-        stored_model_path = path.join(ollama.model.models_home, model) + ".bin"
-        if path.exists(stored_model_path):
-            model_path = stored_model_path
-        else:
-            # try loading this as a path to a model, rather than a model name
-            model_path = path.abspath(model)
-
+def load(model_name, models={}):
+    model = models.get(model_name, None)
+    if not model:
+        model_path = path.expanduser(model_name)
         if not path.exists(model_path):
-            raise Exception(f"Model not found: {model}")
+            model_path = path.join(models_home, model_name + ".bin")

         try:
             # suppress LLM's output
             with suppress_stderr():
-                llm = LLM(model_path, verbose=False)
-                llms.update({model: llm})
-        except Exception as e:
+                model = LLM(model_path, verbose=False)
+                models.update({model_name: model})
+        except Exception:
             # e is sent to devnull, so create a generic exception
             raise Exception(f"Failed to load model: {model}")
-    return llm
+
+    return model


-def unload(model, llms={}):
-    if model in llms:
-        llms.pop(model)
+def unload(model_name, models={}):
+    if model_name in models:
+        models.pop(model_name)
diff --git a/ollama/prompt.py b/ollama/prompt.py
index e2bbea75..5e329e3e 100644
--- a/ollama/prompt.py
+++ b/ollama/prompt.py
@@ -1,16 +1,16 @@
-import os
+from os import path
 from difflib import SequenceMatcher
 from jinja2 import Environment, PackageLoader


-def template(model, prompt):
+def template(name, prompt):
     best_ratio = 0
     best_template = ''

     environment = Environment(loader=PackageLoader(__name__, 'templates'))
     for template in environment.list_templates():
-        base, _ = os.path.splitext(template)
-        ratio = SequenceMatcher(None, os.path.basename(model.lower()), base).ratio()
+        base, _ = path.splitext(template)
+        ratio = SequenceMatcher(None, path.basename(name).lower(), base).ratio()
         if ratio > best_ratio:
             best_ratio = ratio
             best_template = template
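For reference, a minimal sketch of how the reworked `ollama.engine` API in this diff might be driven directly, outside the server. The model name and prompt are illustrative assumptions (not files shipped with the repo), and the `models` dict mirrors the per-app cache the server stores under `"models"`.

```python
from ollama import engine

# Shared cache keyed by model name; mirrors the "models" dict the server keeps
# on the aiohttp app. Any plain dict works when calling the engine directly.
models = {}

# Illustrative name: load() resolves it with path.expanduser() first, then
# falls back to <models_home>/<name>.bin in the local model store.
name = "llama-7b"

# generate() templates the prompt, loads (or reuses) the cached llama_cpp
# model, and yields streamed completion chunks (stream=True by default).
for output in engine.generate(name, "Why is the sky blue?", models=models):
    for choice in output.get("choices", []):
        print(choice.get("text", ""), end="", flush=True)

# Evict the cached model instance once finished.
engine.unload(name, models=models)
```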