AI Agents in Production: Lessons from Real Deployments

Building an AI agent demo takes a weekend. Running one in production with real users and real stakes takes months of learning—usually the hard way.

Agent Reliability: Handling Failures

Agents fail in unpredictable ways. Plan for those failures by bounding every run with a step limit, a wall-clock timeout, and a retry budget:

/**
 * Agent loop bounded by three limits: a maximum number of successful steps,
 * a wall-clock timeout, and a budget of consecutive failed attempts.
 *
 * When any limit trips, `execute` returns a structured failure built by
 * `gracefulExit` (partial result, reason/code, step history). Errors thrown by
 * `decideNextAction` or `executeAction` are caught and converted; they are
 * not propagated to the caller.
 */
class ResilientAgent {
  private maxRetries = 3
  private maxSteps = 50
  private timeoutMs = 300000  // 5 minutes

  /**
   * Runs the decide/act loop until the agent reports completion or a limit trips.
   *
   * @param task Task description. NOTE(review): not forwarded to
   *   `decideNextAction()` in this block — presumably the agent state is set up
   *   elsewhere; confirm.
   * @returns `{ success: true, result }` when an action of type `'complete'`
   *   is decided; otherwise the failure object from `gracefulExit` with code
   *   `'timeout'`, `'max_retries'` or `'max_steps'`.
   */
  async execute(task: string): Promise<AgentResult> {
    const startTime = Date.now()
    let steps = 0
    let retries = 0

    while (steps < this.maxSteps) {
      // The timeout is only evaluated between iterations: an in-flight
      // decision/action or a backoff sleep is not interrupted by it.
      if (Date.now() - startTime > this.timeoutMs) {
        return this.gracefulExit('Timeout exceeded', 'timeout')
      }

      try {
        const action = await this.decideNextAction()

        if (action.type === 'complete') {
          return { success: true, result: action.result }
        }

        await this.executeAction(action)
        steps++
        retries = 0  // Only *consecutive* failures count against the budget

      } catch (error) {
        retries++
        if (retries >= this.maxRetries) {
          // Anything can be thrown in JS; never assume an Error instance.
          return this.gracefulExit(this.describeFailure(error), 'max_retries')
        }
        // Exponential backoff; `retries` is already incremented, so with
        // maxRetries = 3 the waits are 2s and then 4s.
        await this.sleep(Math.pow(2, retries) * 1000)
      }
    }

    return this.gracefulExit('Max steps exceeded', 'max_steps')
  }

  /** Turns an arbitrary thrown value into a human-readable failure reason. */
  private describeFailure(error: unknown): string {
    if (error instanceof Error) {
      return error.message
    }
    // Error-like objects (e.g. deserialized API errors) keep their message.
    if (typeof error === 'object' && error !== null && 'message' in error) {
      const message = (error as { message: unknown }).message
      if (typeof message === 'string') {
        return message
      }
    }
    return String(error)
  }

  /**
   * Builds the failure result returned when a limit trips.
   *
   * @param reason Human-readable explanation of why the run stopped.
   * @param code Machine-readable stop code (`'timeout'`, `'max_retries'`, `'max_steps'`).
   */
  private gracefulExit(reason: string, code: string): AgentResult {
    return {
      success: false,
      partialResult: this.synthesizePartialResult(),
      error: { reason, code },
      steps: this.getStepHistory()
    }
  }
}

Cost Management

Token costs spiral without careful management:

/**
 * Accumulates token usage for a run and checks it against a spending budget.
 *
 * Rates are looked up in `MODEL_RATES[model]` and, given the division by
 * 1,000,000 below, are expressed per million tokens. Costs are compared with
 * `limits.maxCostUsd`, so they are in USD.
 */
class CostTracker {
  private tokenCounts = { input: 0, output: 0 }
  private limits: { maxCostUsd: number; maxTokens: number }
  private model?: string

  /**
   * @param options.model Model whose rates `checkBudget()` uses by default.
   * @param options.limits Budget ceilings; any omitted limit is unlimited
   *   (`Infinity`), so `new CostTracker()` tracks usage without enforcing a cap.
   */
  constructor(
    options: {
      model?: string
      limits?: { maxCostUsd?: number; maxTokens?: number }
    } = {}
  ) {
    this.model = options.model
    this.limits = {
      maxCostUsd: options.limits?.maxCostUsd ?? Infinity,
      maxTokens: options.limits?.maxTokens ?? Infinity
    }
  }

  /** Adds the token counts of one model call to the running totals. */
  recordUsage(inputTokens: number, outputTokens: number) {
    this.tokenCounts.input += inputTokens
    this.tokenCounts.output += outputTokens
  }

  /**
   * Prices the tokens recorded so far at the given model's rates.
   *
   * @param model Key into `MODEL_RATES`.
   * @returns Estimated cost of all recorded usage.
   * @throws Error if `MODEL_RATES` has no entry for `model`.
   */
  getEstimatedCost(model: string): number {
    const rates = MODEL_RATES[model]
    if (rates == null) {
      // Fail with a message that names the model instead of an opaque
      // "cannot read properties of undefined".
      throw new Error(`Unknown model "${model}": no entry in MODEL_RATES`)
    }
    return (
      this.tokenCounts.input * rates.input +
      this.tokenCounts.output * rates.output
    ) / 1_000_000
  }

  /**
   * Compares recorded usage with the configured limits.
   *
   * @param model Model to price usage with; defaults to the model given to
   *   the constructor.
   * @returns `ok` is true while cost is below `maxCostUsd` and total tokens
   *   are below `maxTokens`; `remaining` is the unspent cost budget (negative
   *   once overspent).
   * @throws Error if no model was configured or passed.
   */
  checkBudget(model: string | undefined = this.model): { ok: boolean; remaining: number } {
    if (model === undefined) {
      throw new Error('CostTracker.checkBudget: no model configured')
    }
    const cost = this.getEstimatedCost(model)
    const totalTokens = this.tokenCounts.input + this.tokenCounts.output
    return {
      ok: cost < this.limits.maxCostUsd && totalTokens < this.limits.maxTokens,
      remaining: this.limits.maxCostUsd - cost
    }
  }
}

// Implement cost-aware routing
/**
 * Picks a model name for a task given the remaining budget.
 *
 * Decision order:
 *  1. A task whose complexity is 'simple' while less than 0.10 of budget
 *     remains gets the cheap model.
 *  2. Otherwise a task flagged `requiresReasoning` gets the larger model.
 *  3. Everything else gets the cheap model.
 *
 * NOTE(review): because rule 3 already returns the cheap model, the budget
 * check in rule 1 only changes the outcome for simple tasks that also require
 * reasoning. A non-simple reasoning task is routed to the larger model even
 * when the budget is nearly exhausted — confirm that is intended.
 */
function selectModel(task: TaskAnalysis, budget: Budget): string {
  const cheapModel = 'claude-4-haiku'
  const reasoningModel = 'claude-4-sonnet'

  // Short-circuit keeps the budget untouched unless the task is simple.
  const forceCheap = task.complexity === 'simple' && budget.remaining < 0.10
  if (forceCheap) {
    return cheapModel
  }

  return task.requiresReasoning ? reasoningModel : cheapModel
}

Observability: Logging Agent Decisions

/**
 * Record of one agent run: identifying metadata, the ordered list of steps
 * the agent took, the final outcome, and run-level totals.
 */
interface AgentTrace {
  /** Identifier tying together everything recorded for this run. */
  traceId: string
  /** The task the agent was asked to perform. */
  task: string
  /** When the run started. */
  startedAt: Date
  /** One entry per agent step, in the order they were recorded. */
  steps: {
    stepNumber: number
    /** The agent's stated reasoning for this step. */
    thought: string
    /** Tool invoked and the input handed to it (shape is tool-specific). */
    action: { tool: string; input: unknown }
    result: string
    /** Tokens attributed to this step, split by direction. */
    tokensUsed: { input: number; output: number }
    /** Step duration in milliseconds. */
    durationMs: number
    timestamp: Date
  }[]
  /** How the run ended. */
  outcome:
    | 'success'
    | 'failure'
    | 'timeout'
    | 'cancelled'
  totalTokens: number
  /** Total cost of the run in USD. */
  totalCostUsd: number
  /** Total run duration in milliseconds. */
  totalDurationMs: number
}

/**
 * Wraps an agent run and records what happened as an `AgentTrace`:
 * each step the agent reports, how the run ended, and run-level totals.
 */
class AgentObserver {
  /**
   * Executes `agentRun` while collecting a trace, then persists the trace.
   *
   * The trace is finalized and persisted whether the run succeeds or throws,
   * so failed runs are still observable. A failure of the run itself is
   * rethrown unchanged after the trace has been persisted.
   *
   * @param agentRun Starts the agent and resolves when it is done.
   * @param task Optional task description stored on the trace.
   * @returns The completed trace (only when `agentRun` resolves).
   * @throws Whatever `agentRun` throws; or the `persistTrace` error when the
   *   run itself succeeded but persisting failed.
   */
  async trace(agentRun: () => Promise<void>, task: string = ''): Promise<AgentTrace> {
    const trace: AgentTrace = {
      traceId: crypto.randomUUID(),
      task,
      startedAt: new Date(),
      steps: [],
      // Pessimistic default; flipped to 'success' only once the run resolves.
      outcome: 'failure',
      totalTokens: 0,
      // Not computed here: pricing needs per-model rates this class does not have.
      totalCostUsd: 0,
      totalDurationMs: 0
    }

    // Instrument the agent.
    // NOTE(review): the listener is never removed, so repeated trace() calls
    // on the same agent keep feeding earlier traces — unsubscribe here if
    // `onStep` offers a way to do so.
    this.agent.onStep((step) => {
      trace.steps.push(step)
      this.emit('step', { traceId: trace.traceId, step })
    })

    let runFailed = false
    let runError: unknown
    try {
      await agentRun()
      trace.outcome = 'success'
    } catch (error) {
      runFailed = true
      runError = error
    }

    // Calculate totals
    trace.totalDurationMs = Date.now() - trace.startedAt.getTime()
    trace.totalTokens = trace.steps.reduce(
      (sum, step) => sum + (step.tokensUsed?.input ?? 0) + (step.tokensUsed?.output ?? 0),
      0
    )

    if (runFailed) {
      try {
        await this.persistTrace(trace)
      } catch (persistError) {
        // The agent failure is the error the caller needs to see; report the
        // persistence problem without letting it replace that error.
        console.error('AgentObserver: failed to persist trace', persistError)
      }
      throw runError
    }

    await this.persistTrace(trace)
    return trace
  }
}

Building User Trust

// Show users what the agent is doing
/**
 * One progress message describing where an agent run currently stands.
 */
interface AgentStatusUpdate {
  /** Coarse stage of the run. */
  phase:
    | 'planning'
    | 'executing'
    | 'reviewing'
    | 'complete'
  /** Description of the step being worked on. */
  currentStep: string
  /** Completion percentage, 0-100. */
  progress: number
  /** Presumably tells the UI whether to offer a cancel control — confirm with the consumer. */
  canCancel: boolean
  /** Present when the named action needs approval, with the reason why. */
  requiresApproval?: { action: string; reason: string }
}

// Stream updates to UI
/**
 * Plans and executes a task, yielding a status update before each unit of
 * work so a UI can show what the agent is doing.
 *
 * Every yielded object is a complete `AgentStatusUpdate` (including
 * `currentStep` and `canCancel`).
 *
 * Approval and cancellation rely on generator suspension: the function is
 * paused at each `yield` until the consumer pulls the next value. For a
 * high-risk step the consumer should hold off pulling until the user
 * approves; calling `return()` on the generator instead stops the run before
 * the step executes. A consumer that simply iterates with `for await` will
 * run high-risk steps without pausing.
 *
 * @param task Task description handed to `agent.plan`.
 */
async function* runAgentWithUpdates(task: string): AsyncGenerator<AgentStatusUpdate> {
  yield {
    phase: 'planning',
    currentStep: 'Analyzing task...',
    progress: 0,
    canCancel: true
  }

  const plan = await agent.plan(task)

  for (let i = 0; i < plan.steps.length; i++) {
    const step = plan.steps[i]

    // High-risk actions require approval: announce the step and stay
    // suspended here until the consumer resumes (approve) or returns (reject).
    if (step.riskLevel === 'high') {
      yield {
        phase: 'executing',
        currentStep: step.description,
        progress: (i / plan.steps.length) * 100,
        canCancel: true,
        requiresApproval: {
          action: step.action,
          reason: 'This action will modify data'
        }
      }
    }

    // Reported at the half-way mark of this step's share of the progress bar,
    // before the step actually runs.
    yield {
      phase: 'executing',
      currentStep: step.description,
      progress: ((i + 0.5) / plan.steps.length) * 100,
      canCancel: true
    }

    await agent.executeStep(step)
  }

  // Nothing left to cancel once every step has run.
  yield {
    phase: 'complete',
    currentStep: 'Task complete',
    progress: 100,
    canCancel: false
  }
}

Fallback Strategies

// Graceful degradation when agents fail
/**
 * Handles a task with three tiers of decreasing autonomy: the autonomous
 * agent, then a plain LLM call, then a human queue.
 *
 * A tier is abandoned only when it throws. If both automated tiers throw,
 * the task is queued for a human together with both errors and a
 * `queued_for_human` status is returned. An error from `queueForHuman`
 * itself propagates to the caller.
 *
 * NOTE(review): an agent that reports failure by returning a value rather
 * than throwing will not trigger the fallback — verify how
 * `autonomousAgent.execute` signals failure.
 */
async function handleTask(task: string): Promise<Result> {
  // Tier 1: autonomous agent
  let agentError: unknown
  try {
    return await autonomousAgent.execute(task)
  } catch (caught) {
    agentError = caught
  }

  // Tier 2: simpler single LLM call
  let llmError: unknown
  try {
    return await simpleLLMCall(task)
  } catch (caught) {
    llmError = caught
  }

  // Tier 3: hand over to a human, with both failures attached for context
  await queueForHuman(task, { agentError, llmError })
  return { status: 'queued_for_human' }
}

Conclusion

Production agents require reliability engineering: timeouts, retries, cost tracking, observability, user trust mechanisms, and graceful degradation. The patterns in this guide come from real failures. Learn from them before deploying your own agents.