# ollama_tools/test_model.py

import ollama
import pathlib
import time
import gc
import argparse
import threading
import sys

# Optional dependency: pynvml is only needed for GPU utilization sampling.
# _has_pynvml ends up True only when the import succeeds AND a one-off
# nvmlInit()/nvmlShutdown() round trip works without an NVML error.
try:
    import pynvml
except ImportError:
    print("Warnung: 'pynvml' Bibliothek nicht gefunden. GPU-Überwachung ist nicht verfügbar.", file=sys.stderr)
    _has_pynvml = False
except Exception as import_problem:
    print(f"Warnung: Fehler beim Importieren/Initialisieren von NVML (GPU-Überwachung ist nicht verfügbar): {import_problem}", file=sys.stderr)
    _has_pynvml = False
else:
    try:
        # Probe once that NVML can be initialized, then shut it down again
        # right away: the monitor thread performs its own initialization.
        pynvml.nvmlInit()
        pynvml.nvmlShutdown()
    except pynvml.NVMLError as nvml_problem:  # driver missing / no NVIDIA GPU
        print(f"Warnung: NVML-Treiberfehler oder keine NVIDIA-GPU gefunden: {nvml_problem}. GPU-Überwachung ist nicht verfügbar.", file=sys.stderr)
        _has_pynvml = False
    else:
        _has_pynvml = True


# --- Configuration ---
# Address of the Ollama server.
# NOTE(review): not referenced anywhere else in the visible part of this file;
# confirm whether a client is supposed to be built from it.
OLLAMA_HOST = 'http://localhost:11434'
# Name of the model passed to ollama.chat() by test_llm_performance().
MODEL_NAME = 'llama3.2-vision' # <--- THIS LINE MUST BE PRESENT!


# --- Global state for the GPU monitoring ---
# Loop flag read by gpu_monitor(); test_llm_performance() sets it to True
# before starting the monitor thread and back to False to stop it.
_gpu_monitor_active = False
# GPU utilization samples appended by gpu_monitor() while it is running.
_gpu_utilization_history = []

# --- Load an image file as raw bytes ---
def load_image_as_bytes(path):
    """Read the file at *path* in binary mode and return its content.

    Args:
        path: Filesystem path of the image to read.

    Returns:
        The file content as ``bytes``, or ``None`` when the file does not
        exist or cannot be read. In the failure case an error message is
        printed to stderr instead of raising, so the caller can skip the test.
    """
    try:
        with open(path, 'rb') as f:
            return f.read()
    except FileNotFoundError:
        print(f"Fehler: Bilddatei nicht gefunden unter {path}", file=sys.stderr)
    except OSError as error:
        # Covers the remaining I/O failures (path is a directory, permission
        # denied, ...) so they are reported like a missing file instead of
        # crashing the script with a traceback.
        print(f"Fehler: Bilddatei {path} konnte nicht gelesen werden: {error}", file=sys.stderr)
    return None

# --- GPU monitoring (runs in a separate thread) ---
def gpu_monitor(interval_sec=1, device_index=0):
    """Sample GPU utilization until ``_gpu_monitor_active`` becomes False.

    Does nothing when pynvml is unavailable (``_has_pynvml`` is False).
    Otherwise it resets ``_gpu_utilization_history``, initializes NVML and
    appends one ``util.gpu`` sample per iteration, sleeping ``interval_sec``
    seconds between samples. The loop also stops on the first NVML query
    error. Status and error messages go to stderr.

    Args:
        interval_sec: Pause between two samples, in seconds.
        device_index: NVML device index to sample (defaults to the first GPU).
    """
    if not _has_pynvml:
        return

    global _gpu_utilization_history
    _gpu_utilization_history = []  # start with a clean history for every request

    # Tracks whether nvmlInit() succeeded. The stop flag cannot be used for
    # this: it is already False when the loop ends normally, which previously
    # meant nvmlShutdown() was never called after a regular run.
    nvml_initialized = False
    try:
        pynvml.nvmlInit()
        nvml_initialized = True
        handle = pynvml.nvmlDeviceGetHandleByIndex(device_index)
        print(f"\nStarte GPU-Überwachung (alle {interval_sec}s)...", file=sys.stderr)

        while _gpu_monitor_active:
            try:
                util = pynvml.nvmlDeviceGetUtilizationRates(handle)
            except pynvml.NVMLError as error:
                print(f"Fehler beim Abfragen der GPU-Auslastung: {error}", file=sys.stderr)
                break
            _gpu_utilization_history.append(util.gpu)
            time.sleep(interval_sec)

    except pynvml.NVMLError as error:
        print(f"Fehler beim Initialisieren der NVML für Überwachung: {error}", file=sys.stderr)
    finally:
        # Every successful nvmlInit() is paired with exactly one nvmlShutdown().
        if nvml_initialized:
            try:
                pynvml.nvmlShutdown()
                print("GPU-Überwachung beendet.", file=sys.stderr)
            except pynvml.NVMLError as error:
                print(f"Fehler beim Beenden der NVML: {error}", file=sys.stderr)

# --- LLM speed test ---
def test_llm_performance(prompt_text, image_data=None):
    """Run one streamed chat request against Ollama and print timing statistics.

    While the request is running, ``gpu_monitor`` samples GPU utilization in
    a background thread. After the stream has finished, the function prints
    the server-side timings reported in the final ("done") chunk, the
    Python-side wall-clock measurements and the collected GPU utilization.

    Args:
        prompt_text: Text of the user message.
        image_data: Optional raw image bytes attached to the user message.

    Returns:
        None. Results are printed to stdout. If the request fails, the error
        is printed to stderr and no statistics are printed.
    """
    global _gpu_monitor_active

    print("\n--- Starte LLM-Performance-Test ---")
    print(f"Modell: {MODEL_NAME}")
    print(f"Prompt: '{prompt_text}'")
    if image_data:
        print(f"Mit Bild: {len(image_data)} Bytes")

    # perf_counter() is monotonic, so the measured intervals cannot be skewed
    # by system clock adjustments.
    start_full_process_time = time.perf_counter()

    response_parts = []
    generated_tokens = 0
    time_to_first_token = 0.0
    first_token_received = False
    # Server-side timings stay at zero until the final chunk delivers them.
    load_duration = 0.0
    prompt_eval_duration = 0.0
    eval_duration = 0.0
    total_duration_ollama = 0.0

    def _ns_to_seconds(chunk, key):
        """Return the nanosecond field *key* of *chunk* in seconds (0 if missing/None)."""
        return (chunk.get(key) or 0) / 1_000_000_000

    # Start the GPU monitoring thread (one sample every 0.5 seconds).
    _gpu_monitor_active = True
    monitor_thread = threading.Thread(target=gpu_monitor, args=(0.5,), daemon=True)
    monitor_thread.start()

    try:
        user_message = {'role': 'user', 'content': prompt_text}
        if image_data:
            user_message['images'] = [image_data]

        stream_response = ollama.chat(
            model=MODEL_NAME,
            messages=[user_message],
            options={
                'temperature': 0.2,
                'num_predict': 400,
            },
            # keep_alive is a request-level argument, not a model option;
            # inside `options` it had no effect.
            keep_alive='0s',
            stream=True,
        )

        for chunk in stream_response:
            if not first_token_received:
                # Time until the first streamed chunk arrives.
                time_to_first_token = time.perf_counter() - start_full_process_time
                first_token_received = True

            piece = chunk['message'].get('content')
            if piece:
                response_parts.append(piece)

            if chunk.get('done'):
                generated_tokens = chunk.get('eval_count') or 0
                load_duration = _ns_to_seconds(chunk, 'load_duration')
                prompt_eval_duration = _ns_to_seconds(chunk, 'prompt_eval_duration')
                eval_duration = _ns_to_seconds(chunk, 'eval_duration')
                total_duration_ollama = _ns_to_seconds(chunk, 'total_duration')

    except Exception as e:
        print(f"Fehler bei der Ollama-Anfrage: {e}", file=sys.stderr)
        return
    finally:
        # Stop the monitoring thread and wait for it, on success and on error.
        _gpu_monitor_active = False
        monitor_thread.join()

    total_python_duration = time.perf_counter() - start_full_process_time
    response_text = "".join(response_parts)

    # --- Print statistics ---
    print("\n--- Ergebnisse ---")
    print(f"Antwort (Auszug): '{response_text[:200]}...'")
    print(f"Generierte Tokens: {generated_tokens}")

    if total_duration_ollama > 0:
        print("\n--- Ollama Server-Statistiken (aus 'done' Chunk) ---")
        print(f"  Modellladezeit (load_duration): {load_duration:.3f} s")
        print(f"  Prompt-Evaluierungszeit (prompt_eval_duration): {prompt_eval_duration:.3f} s")
        print(f"  Antwortgenerierungszeit (eval_duration): {eval_duration:.3f} s")
        print(f"  **Ollama Server Gesamtzeit (total_duration): {total_duration_ollama:.3f} s**")
        if eval_duration > 0:
            print(f"  **Ollama Server Tokens/Sekunde: {generated_tokens / eval_duration:.2f} t/s**")
        else:
            # Avoid a ZeroDivisionError when the server reports no eval time.
            print("  Ollama Server Tokens/Sekunde: nicht berechenbar (eval_duration ist 0).")
    else:
        print("\n--- Ollama Server-Statistiken nicht direkt verfügbar (Streaming-Modus) ---")

    print("\n--- Python-seitige Messungen ---")
    print(f"  **Time to First Token (TTFT): {time_to_first_token:.3f} s**")
    print(f"  **Gesamtdauer der Anfrage (Python-seitig): {total_python_duration:.3f} s**")

    if generated_tokens > 0 and total_python_duration > 0:
        python_tokens_per_second = generated_tokens / total_python_duration
        print(f"  **Tokens/Sekunde (Python-seitig, inkl. Netzwerk): {python_tokens_per_second:.2f} t/s**")
    else:
        print("  Tokens/Sekunde konnte nicht berechnet werden (keine Tokens generiert oder Zeit null).")

    # --- GPU utilization statistics ---
    if _has_pynvml and _gpu_utilization_history:
        avg_gpu_util = sum(_gpu_utilization_history) / len(_gpu_utilization_history)
        max_gpu_util = max(_gpu_utilization_history)
        print("\n--- GPU-Auslastung während der Anfrage ---")
        print(f"  Durchschnittliche GPU-Auslastung: {avg_gpu_util:.2f}%")
        print(f"  Maximale GPU-Auslastung: {max_gpu_util}%")
    elif not _has_pynvml:
        print("\n--- GPU-Auslastung konnte nicht gemessen werden (pynvml nicht verfügbar oder Fehler) ---")
    else:
        print("\n--- Keine GPU-Auslastungsdaten gesammelt (möglicherweise zu kurz oder Problem) ---")

    gc.collect()


# --- Script entry point with argument parsing ---
if __name__ == "__main__":
    # Command line: an optional image path and the prompt text to send.
    arg_parser = argparse.ArgumentParser(description='Test LLM performance with optional image input.')
    arg_parser.add_argument('--image', type=str, help='Path to the image file for vision model testing (optional).')
    arg_parser.add_argument('--text_prompt', type=str, default="Beschreibe das Bild so detailliert wie möglich.",
                            help='Text prompt to send to the LLM.')
    args = arg_parser.parse_args()

    if not args.image:
        # No image given: run the prompt as text only, followed by a second
        # plain-text prompt for comparison.
        print("\nKein Bildpfad angegeben. Teste nur Text-Prompt.")
        test_llm_performance(args.text_prompt)

        print("\nTeste zusätzlich einen reinen Text-Prompt (als Vergleich).")
        test_llm_performance("Erzähle mir einen kurzen Witz.")
    else:
        # Image given: run the vision test only if the file could be loaded.
        image_bytes = load_image_as_bytes(args.image)
        if not image_bytes:
            print("\nÜberspringe Vision-Test, da kein Bild geladen werden konnte.")
        else:
            print(f"\nTeste Vision-Modell mit Bild: {args.image}")
            test_llm_performance(args.text_prompt, image_data=image_bytes)

    print("\nAlle Tests abgeschlossen.")