initial commit

2025-08-12 00:26:04 +02:00
commit a9d01adb6d
3 changed files with 319 additions and 0 deletions
@@ -0,0 +1,38 @@
+<!DOCTYPE html>
+<html lang="de">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>LLM Interface (Ollama)</title>
+    <style>
+        body { font-family: sans-serif; margin: 20px; background-color: #f4f4f4; color: #333; }
+        .container { max-width: 800px; margin: 0 auto; background-color: #fff; padding: 30px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1); }
+        h1 { color: #0056b3; text-align: center; margin-bottom: 30px; }
+        form { display: flex; flex-direction: column; gap: 15px; }
+        label { font-weight: bold; }
+        textarea { width: 100%; padding: 10px; border: 1px solid #ccc; border-radius: 4px; resize: vertical; min-height: 100px; font-size: 16px; }
+        button { background-color: #007bff; color: white; padding: 12px 20px; border: none; border-radius: 5px; cursor: pointer; font-size: 18px; transition: background-color 0.3s ease; }
+        button:hover { background-color: #0056b3; }
+        .response-container { margin-top: 30px; padding: 20px; background-color: #e9ecef; border-left: 5px solid #007bff; border-radius: 4px; }
+        /* pre-wrap keeps the line breaks of the answer while still wrapping long lines */
+        pre { white-space: pre-wrap; word-wrap: break-word; font-family: monospace; font-size: 16px; line-height: 1.5; }
+    </style>
+</head>
+<body>
+    <div class="container">
+        <h1>Dein lokales LLM Interface (Ollama)</h1>
+
+        <!-- Prompt form: POSTs the "prompt" field to /generate. The textarea is
+             pre-filled with prompt_value when the template context provides one. -->
+        <form action="/generate" method="post">
+            <label for="prompt">Deine Anfrage an das LLM:</label>
+            <textarea id="prompt" name="prompt" placeholder="Stelle eine Frage oder gib eine Anweisung ein..." required>{{ prompt_value if prompt_value else '' }}</textarea>
+            <button type="submit">Generieren</button>
+        </form>
+
+        <!-- Answer block: only rendered when the template context contains a
+             truthy "response" value. -->
+        {% if response %}
+        <div class="response-container">
+            <h2>Antwort des LLM:</h2>
+            <pre>{{ response }}</pre>
+        </div>
+        {% endif %}
+    </div>
+</body>
+</html>
@@ -0,0 +1,209 @@
+import ollama
+import pathlib
+import time
+import gc
+import argparse
+import threading
+import sys
+
+# pynvml is an optional dependency: without it the benchmark still runs, but
+# no GPU utilisation is recorded. _has_pynvml is the switch the rest of the
+# module checks before touching NVML.
+try:
+    import pynvml  # optional; only needed for GPU monitoring
+    _has_pynvml = True
+except ImportError:
+    print("Warnung: 'pynvml' Bibliothek nicht gefunden. GPU-Überwachung ist nicht verfügbar.", file=sys.stderr)
+    _has_pynvml = False
+except Exception as error:  # any other failure raised while importing pynvml
+    print(f"Warnung: Fehler beim Importieren/Initialisieren von NVML (GPU-Überwachung ist nicht verfügbar): {error}", file=sys.stderr)
+    _has_pynvml = False
+
+
+# Probe NVML once at import time so that a missing driver/GPU disables
+# monitoring up front instead of failing later.
+if _has_pynvml:
+    try:
+        pynvml.nvmlInit() # Only needs to initialise successfully once
+        pynvml.nvmlShutdown() # Shut down again right away because the monitor thread initialises NVML itself
+    except pynvml.NVMLError as error: # Catches NVML-specific errors
+        print(f"Warnung: NVML-Treiberfehler oder keine NVIDIA-GPU gefunden: {error}. GPU-Überwachung ist nicht verfügbar.", file=sys.stderr)
+        _has_pynvml = False
+
+
+# --- Configuration ---
+OLLAMA_HOST = 'http://localhost:11434'
+MODEL_NAME = 'llama3.2-vision' # <--- THIS LINE MUST BE PRESENT!
+
+
+# --- Global state for GPU monitoring ---
+# Flag that keeps the monitoring loop running while it is True.
+_gpu_monitor_active = False
+# GPU utilisation samples collected during the current request.
+_gpu_utilization_history = []
+
+# --- Funktion zum Laden des Bildes als Bytes ---
+def load_image_as_bytes(path):
+    try:
+        with open(path, 'rb') as f:
+            return f.read()
+    except FileNotFoundError:
+        print(f"Fehler: Bilddatei nicht gefunden unter {path}", file=sys.stderr)
+        return None
+
+# --- Funktion zur GPU-Überwachung (läuft in separatem Thread) ---
+def gpu_monitor(interval_sec=1):
+    if not _has_pynvml:
+        return
+
+    global _gpu_monitor_active, _gpu_utilization_history
+    _gpu_utilization_history = [] # Zurücksetzen für jede neue Anfrage
+    
+    try:
+        pynvml.nvmlInit() # 
+        handle = pynvml.nvmlDeviceGetHandleByIndex(0) 
+        print(f"\nStarte GPU-Überwachung (alle {interval_sec}s)...", file=sys.stderr)
+        
+        while _gpu_monitor_active:
+            try:
+                util = pynvml.nvmlDeviceGetUtilizationRates(handle) 
+                _gpu_utilization_history.append(util.gpu)
+                # print(f"GPU Utilization: {util.gpu}%", file=sys.stderr) # Zum Debuggen
+            except pynvml.NVMLError as error: 
+                print(f"Fehler beim Abfragen der GPU-Auslastung: {error}", file=sys.stderr)
+                break
+            time.sleep(interval_sec)
+
+    except pynvml.NVMLError as error: 
+        print(f"Fehler beim Initialisieren der NVML für Überwachung: {error}", file=sys.stderr)
+    finally:
+        if _gpu_monitor_active: # Nur nvmlShutdown aufrufen, wenn init erfolgreich war
+            try:
+                pynvml.nvmlShutdown() 
+                print("GPU-Überwachung beendet.", file=sys.stderr)
+            except pynvml.NVMLError as error:
+                print(f"Fehler beim Beenden der NVML: {error}", file=sys.stderr)
+
+# --- Funktion zum Testen der LLM-Geschwindigkeit ---
+def test_llm_performance(prompt_text, image_data=None):
+    global _gpu_monitor_active, _gpu_utilization_history
+
+    print(f"\n--- Starte LLM-Performance-Test ---")
+    print(f"Modell: {MODEL_NAME}") 
+    print(f"Prompt: '{prompt_text}'")
+    if image_data:
+        print(f"Mit Bild: {len(image_data)} Bytes")
+
+    start_full_process_time = time.time() 
+
+    response_text = ""
+    generated_tokens = 0
+    time_to_first_token = 0.0
+    first_token_received = False
+    
+    # Starte den GPU-Überwachungs-Thread
+    _gpu_monitor_active = True
+    monitor_thread = threading.Thread(target=gpu_monitor, args=(0.5,)) # Überwacht alle 0.5 Sekunden
+    monitor_thread.daemon = True # Thread wird beendet, wenn Hauptprogramm endet
+    monitor_thread.start()
+
+    try:
+        messages = [{'role': 'user', 'content': prompt_text}]
+        if image_data:
+            messages[0]['images'] = [image_data]
+
+        stream_response = ollama.chat(
+            model=MODEL_NAME, 
+            messages=messages,
+            options={
+                'keep_alive': '0s',
+                'temperature': 0.2,
+                'num_predict': 400
+            },
+            stream=True
+        )
+
+        for chunk in stream_response:
+            if not first_token_received:
+                time_to_first_token = time.time() - start_full_process_time
+                first_token_received = True
+
+            if 'content' in chunk['message']:
+                response_text += chunk['message']['content']
+
+            if chunk.get('done'):
+                generated_tokens = chunk['eval_count']
+                load_duration = chunk.get('load_duration', 0) / 1_000_000_000
+                prompt_eval_duration = chunk.get('prompt_eval_duration', 0) / 1_000_000_000
+                eval_duration = chunk.get('eval_duration', 0) / 1_000_000_000
+                total_duration_ollama = chunk.get('total_duration', 0) / 1_000_000_000
+
+    except Exception as e:
+        print(f"Fehler bei der Ollama-Anfrage: {e}", file=sys.stderr)
+        return
+    finally:
+        # Beende den GPU-Überwachungs-Thread
+        _gpu_monitor_active = False
+        monitor_thread.join() # Warte, bis der Überwachungs-Thread beendet ist
+
+    end_full_process_time = time.time()
+    total_python_duration = end_full_process_time - start_full_process_time
+
+    # --- Statistiken ausgeben ---
+    print("\n--- Ergebnisse ---")
+    print(f"Antwort (Auszug): '{response_text[:200]}...'")
+    print(f"Generierte Tokens: {generated_tokens}")
+
+    if 'total_duration_ollama' in locals() and total_duration_ollama > 0:
+        print("\n--- Ollama Server-Statistiken (aus 'done' Chunk) ---")
+        print(f"  Modellladezeit (load_duration): {load_duration:.3f} s")
+        print(f"  Prompt-Evaluierungszeit (prompt_eval_duration): {prompt_eval_duration:.3f} s")
+        print(f"  Antwortgenerierungszeit (eval_duration): {eval_duration:.3f} s")
+        print(f"  **Ollama Server Gesamtzeit (total_duration): {total_duration_ollama:.3f} s**")
+        print(f"  **Ollama Server Tokens/Sekunde: {generated_tokens / eval_duration:.2f} t/s**")
+    else:
+        print("\n--- Ollama Server-Statistiken nicht direkt verfügbar (Streaming-Modus) ---")
+
+    print("\n--- Python-seitige Messungen ---")
+    print(f"  **Time to First Token (TTFT): {time_to_first_token:.3f} s**")
+    print(f"  **Gesamtdauer der Anfrage (Python-seitig): {total_python_duration:.3f} s**")
+    
+    if generated_tokens > 0 and total_python_duration > 0:
+        python_tokens_per_second = generated_tokens / total_python_duration
+        print(f"  **Tokens/Sekunde (Python-seitig, inkl. Netzwerk): {python_tokens_per_second:.2f} t/s**")
+    else:
+        print("  Tokens/Sekunde konnte nicht berechnet werden (keine Tokens generiert oder Zeit null).")
+
+    # --- GPU-Auslastung Statistik ---
+    if _has_pynvml and _gpu_utilization_history:
+        avg_gpu_util = sum(_gpu_utilization_history) / len(_gpu_utilization_history)
+        max_gpu_util = max(_gpu_utilization_history)
+        print("\n--- GPU-Auslastung während der Anfrage ---")
+        print(f"  Durchschnittliche GPU-Auslastung: {avg_gpu_util:.2f}%")
+        print(f"  Maximale GPU-Auslastung: {max_gpu_util}%")
+    elif not _has_pynvml:
+        print("\n--- GPU-Auslastung konnte nicht gemessen werden (pynvml nicht verfügbar oder Fehler) ---")
+    else:
+        print("\n--- Keine GPU-Auslastungsdaten gesammelt (möglicherweise zu kurz oder Problem) ---")
+
+    gc.collect()
+
+
+# --- Hauptteil des Skripts mit Argument Parsing ---
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description='Test LLM performance with optional image input.')
+    parser.add_argument('--image', type=str, help='Path to the image file for vision model testing (optional).')
+    parser.add_argument('--text_prompt', type=str, default="Beschreibe das Bild so detailliert wie möglich.",
+                        help='Text prompt to send to the LLM.')
+    
+    args = parser.parse_args()
+
+    if args.image:
+        test_image_bytes = load_image_as_bytes(args.image)
+        if test_image_bytes:
+            print(f"\nTeste Vision-Modell mit Bild: {args.image}")
+            test_llm_performance(args.text_prompt, image_data=test_image_bytes)
+        else:
+            print("\nÜberspringe Vision-Test, da kein Bild geladen werden konnte.")
+    else:
+        print("\nKein Bildpfad angegeben. Teste nur Text-Prompt.")
+        test_llm_performance(args.text_prompt)
+
+    if not args.image:
+        print("\nTeste zusätzlich einen reinen Text-Prompt (als Vergleich).")
+        test_llm_performance("Erzähle mir einen kurzen Witz.")
+
+    print("\nAlle Tests abgeschlossen.")
@@ -0,0 +1,72 @@
+from fastapi import FastAPI, Request, Form
+from fastapi.responses import HTMLResponse
+from fastapi.templating import Jinja2Templates
+import requests
+import json
+import argparse
+import uvicorn  # Diese Import-Zeile ist wichtig!
+
+# FastAPI application object the route decorators below register on.
+app = FastAPI()
+# Jinja2 template loader; templates are looked up in the "templates" directory.
+templates = Jinja2Templates(directory="templates")
+
+OLLAMA_API_URL = "http://localhost:11434/api/generate"  # Adjust this to your Ollama URL if necessary
+
+
+@app.get("/", response_class=HTMLResponse)
+async def read_root(request: Request):
+    """
+    Zeigt das Haupt-Webinterface mit dem Eingabefeld an.
+    """
+    return templates.TemplateResponse(
+        "index.html", {"request": request, "response": None}
+    )
+
+
+@app.post("/generate", response_class=HTMLResponse)
+async def generate_text(request: Request, prompt: str = Form(...)):
+    """
+    Verarbeitet die Anfrage, sendet sie an Ollama und zeigt die Antwort an.
+    """
+    headers = {"Content-Type": "application/json"}
+    payload = {
+        "model": "llama3.2-vision",  # Passe dies an das Modell an, das du mit Ollama verwendest (z.B. "mistral", "phi3")
+        "prompt": prompt,
+        "stream": False,  # Setze dies auf True, wenn du gestreamte Antworten verarbeiten möchtest
+    }
+
+    response_text = "Fehler bei der Kommunikation mit dem LLM."
+    try:
+        ollama_response = requests.post(
+            OLLAMA_API_URL, headers=headers, data=json.dumps(payload)
+        )
+        ollama_response.raise_for_status()  # Löst einen HTTPError für schlechte Antworten (4xx oder 5xx) aus
+
+        data = ollama_response.json()
+        if "response" in data:
+            response_text = data["response"]
+        else:
+            response_text = f"Unerwartete Antwort von Ollama: {data}"
+    except requests.exceptions.ConnectionError:
+        response_text = "Konnte keine Verbindung zum Ollama-Server herstellen. Stelle sicher, dass er läuft und unter der richtigen Adresse erreichbar ist."
+    except requests.exceptions.RequestException as e:
+        response_text = f"Ein Fehler ist aufgetreten: {e}"
+
+    return templates.TemplateResponse(
+        "index.html",
+        {"request": request, "response": response_text, "prompt_value": prompt},
+    )
+
+
+if __name__ == "__main__":
+
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument("-p", 
+                        "--port", 
+                        dest="port", 
+                        help="The port to use. Default: 8000",
+                        type=int, 
+                        default=8000
+                       )
+    args = parser.parse_args()
+    uvicorn.run(app, host="0.0.0.0", port=args.port)