
Media Generation

Generate images, video, audio, and transcriptions with a unified API backed by pluggable media providers.

Media generation via app.ai()

Generate images, audio, video, and transcriptions from your agents, using the same method pattern as app.ai().

Agents that generate reports, product listings, marketing content, or customer-facing assets often need more than text. AgentField's media generation API lets you create images, narrate text, produce video, and transcribe audio through a unified interface backed by pluggable providers like fal.ai, DALL-E, and ElevenLabs.
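The "pluggable providers" idea can be illustrated with a minimal registry that routes a model name to a provider backend. This is a sketch of the general pattern, not AgentField's internals; the prefix-based dispatch rule and the stand-in backends are assumptions for illustration:

```python
# Minimal sketch of a pluggable media-provider registry.
# One generate() entry point routes a model name to whichever
# backend (fal.ai, DALL-E, ElevenLabs, ...) is registered for it.
from typing import Callable, Dict

PROVIDERS: Dict[str, Callable[[str, str], dict]] = {}

def register_provider(prefix: str, backend: Callable[[str, str], dict]) -> None:
    PROVIDERS[prefix] = backend

def generate(model: str, prompt: str) -> dict:
    # Route on the longest matching prefix of the model name.
    for prefix in sorted(PROVIDERS, key=len, reverse=True):
        if model.startswith(prefix):
            return PROVIDERS[prefix](model, prompt)
    raise ValueError(f"No provider registered for model {model!r}")

# Stand-in backends; real ones would call the provider's HTTP API.
register_provider("fal-ai/", lambda m, p: {"provider": "fal", "model": m, "prompt": p})
register_provider("dall-e", lambda m, p: {"provider": "openai", "model": m, "prompt": p})

result = generate("fal-ai/flux/schnell", "Professional product photo")
print(result["provider"])  # fal
```

Because callers only see generate(), swapping a provider is a registration change rather than a call-site change, which is what makes the unified API above practical.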

from agentfield import Agent, AIConfig

app = Agent(
    node_id="product-listing-generator",
    ai_config=AIConfig(fal_api_key="your-fal-key"),  # or set FAL_KEY env var
)

@app.reasoner()
async def generate_product_listing(product: dict) -> dict:
    # Generate a product image from description
    image = await app.ai_generate_image(
        prompt=f"Professional product photo: {product['name']}, {product['description']}",
        model="fal-ai/flux/schnell",
        size="square_hd",
    )
    image.images[0].save(f"/output/{product['id']}_hero.png")

    # Generate audio narration for the listing
    audio = await app.ai_generate_audio(
        text=f"Introducing {product['name']}: {product['description']}",  # spoken verbatim by the TTS model
        model="fal-ai/f5-tts",
        voice="alloy",
    )
    if audio.audio:
        audio.audio.save(f"/output/{product['id']}_narration.mp3")

    # Generate a short product demo video
    video = await app.ai_generate_video(
        prompt=f"Product demonstration video: {product['name']} in use",
        model="fal-ai/minimax-video/image-to-video",
        duration=10,
    )
    if video.files:
        video.files[0].save(f"/output/{product['id']}_demo.mp4")

    # Transcribe customer review audio
    transcript = await app.ai_transcribe_audio(
        audio_url=product["review_audio_url"],
        model="fal-ai/whisper",
    )

    return {
        "image_url": image.images[0].url if image.images else None,
        "audio_url": audio.audio.url if audio.audio else None,
        "video_url": video.files[0].url if video.files else None,
        "transcript": transcript.text,
    }

// Media generation is Python-only.
// TypeScript agents can call a Python agent's media generation
// capabilities via agent-to-agent calls:
const result = await agent.call('media-agent.generateProductImage', {
  prompt: `Professional product photo: ${product.name}`,
  model: 'fal-ai/flux/schnell',
});
console.log(result.imageUrl);

// Media generation is Python-only.
// Go agents can call a Python agent's media generation
// capabilities via agent-to-agent calls:
result, _ := app.Call(ctx, "media-agent.generateProductImage", map[string]any{
    "prompt": fmt.Sprintf("Professional product photo: %s", product.Name),
    "model":  "fal-ai/flux/schnell",
})
fmt.Println(result["imageUrl"])
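
The Python example above awaits each generation call in sequence. The three generation calls are independent of one another, so they could also run concurrently with asyncio.gather; the same pattern applies to the awaitable app.ai_generate_* methods. A sketch with stand-in coroutines in place of the SDK calls:

```python
import asyncio

# Stand-ins for app.ai_generate_image / _audio / _video; the real
# methods are awaitable, so the same gather() pattern applies.
async def generate_image(prompt: str) -> dict:
    await asyncio.sleep(0)  # placeholder for the provider round-trip
    return {"kind": "image", "prompt": prompt}

async def generate_audio(text: str) -> dict:
    await asyncio.sleep(0)
    return {"kind": "audio", "text": text}

async def generate_video(prompt: str) -> dict:
    await asyncio.sleep(0)
    return {"kind": "video", "prompt": prompt}

async def generate_assets(name: str) -> list:
    # The three calls are independent, so run them concurrently.
    return await asyncio.gather(
        generate_image(f"Professional product photo: {name}"),
        generate_audio(f"Introducing {name}"),
        generate_video(f"Product demonstration video: {name} in use"),
    )

assets = asyncio.run(generate_assets("Widget"))
print([a["kind"] for a in assets])  # ['image', 'audio', 'video']
```

With real provider round-trips of several seconds each, gathering the calls bounds the workflow by the slowest generation instead of the sum of all three.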

What you get back

The example performed four output-producing operations in one workflow: image generation, audio narration, video generation, and transcription. Each comes back as a typed result object with saveable files and URLs, not as an ad hoc provider response you have to normalize yourself.

{
  "image_url": "https://cdn.example.com/product_hero.webp",
  "audio_url": "https://cdn.example.com/product_narration.mp3",
  "video_url": "https://cdn.example.com/product_demo.mp4",
  "transcript": "Customer review text..."
}