Building an AI agent demo takes a weekend. Running one in production with real users and real stakes takes months of learning—usually the hard way.
Agent Reliability: Handling Failures
Agents fail in unpredictable ways. Plan for it:
/**
 * Agent loop with three hard stops: a wall-clock timeout, a cap on successful
 * steps, and a cap on consecutive failures.
 *
 * Hitting any limit returns a failure result built by `gracefulExit` instead of
 * throwing. The helpers used here (`decideNextAction`, `executeAction`, `sleep`,
 * `synthesizePartialResult`, `getStepHistory`) are not defined in this snippet.
 */
class ResilientAgent {
  private readonly maxRetries = 3
  private readonly maxSteps = 50
  private readonly timeoutMs = 300000 // 5 minutes

  /**
   * Runs decide/execute iterations until the agent reports completion or a limit trips.
   *
   * NOTE(review): `task` is not forwarded to `decideNextAction()` here — presumably
   * the agent state carries it; confirm.
   *
   * @param task Description of the work to perform.
   * @returns A success result when an action of type 'complete' is decided, otherwise
   *          the failure result from `gracefulExit` with code 'timeout', 'max_retries'
   *          or 'max_steps'.
   */
  async execute(task: string): Promise<AgentResult> {
    const startTime = Date.now()
    let steps = 0
    let retries = 0

    while (steps < this.maxSteps) {
      // The deadline is only checked between iterations; a call that is already
      // in flight is not interrupted.
      if (Date.now() - startTime > this.timeoutMs) {
        return this.gracefulExit('Timeout exceeded', 'timeout')
      }

      try {
        const action = await this.decideNextAction()
        if (action.type === 'complete') {
          return { success: true, result: action.result }
        }
        await this.executeAction(action)
        steps++
        retries = 0 // Only consecutive failures count towards maxRetries
      } catch (error: unknown) {
        retries++
        if (retries >= this.maxRetries) {
          // Anything can be thrown, so narrow before reading `.message`.
          const reason = error instanceof Error ? error.message : String(error)
          return this.gracefulExit(reason, 'max_retries')
        }
        // Exponential backoff (2s, 4s, ...), capped so the sleep never runs past
        // the overall deadline.
        const backoffMs = Math.pow(2, retries) * 1000
        const remainingMs = this.timeoutMs - (Date.now() - startTime)
        await this.sleep(Math.max(0, Math.min(backoffMs, remainingMs)))
      }
    }

    return this.gracefulExit('Max steps exceeded', 'max_steps')
  }

  /**
   * Builds the failure result returned when a limit is hit.
   *
   * @param reason Human-readable explanation of why the run stopped.
   * @param code   Machine-readable stop code.
   * @returns A result with `success: false`, the partial result, the error and the step history.
   */
  private gracefulExit(reason: string, code: string): AgentResult {
    return {
      success: false,
      partialResult: this.synthesizePartialResult(),
      error: { reason, code },
      steps: this.getStepHistory()
    }
  }
}
Cost Management
Token costs spiral without careful management:
/**
 * Accumulates token usage and checks it against spending limits.
 *
 * Rates come from `MODEL_RATES`, which is declared outside this snippet. The
 * division by 1_000_000 in `getEstimatedCost` means each rate is a price per
 * million tokens.
 */
class CostTracker {
  private tokenCounts = { input: 0, output: 0 }

  /**
   * @param model  Key into `MODEL_RATES` that `checkBudget` prices usage with.
   * @param limits Caps applied by `checkBudget`: `maxCostUsd` on the estimated cost,
   *               `maxTokens` on input + output tokens combined.
   */
  constructor(
    private readonly model: string,
    private readonly limits: { maxCostUsd: number; maxTokens: number }
  ) {}

  /** Adds one call's token counts to the running totals. */
  recordUsage(inputTokens: number, outputTokens: number): void {
    this.tokenCounts.input += inputTokens
    this.tokenCounts.output += outputTokens
  }

  /**
   * Prices all tokens recorded so far at the given model's rates.
   *
   * @param model Key into `MODEL_RATES`.
   * @returns Estimated cost of the recorded usage.
   * @throws Error when `MODEL_RATES` has no entry for `model`.
   */
  getEstimatedCost(model: string): number {
    // Own-property check so inherited keys such as 'toString' are not mistaken for a rate entry.
    const rates = Object.prototype.hasOwnProperty.call(MODEL_RATES, model)
      ? MODEL_RATES[model]
      : undefined
    if (rates === undefined) {
      throw new Error(`No rates configured for model "${model}"`)
    }
    return (
      this.tokenCounts.input * rates.input +
      this.tokenCounts.output * rates.output
    ) / 1_000_000
  }

  /**
   * Compares recorded usage with the configured limits.
   *
   * @returns `ok` is true only while both the cost and the token total are strictly
   *          below their limits; `remaining` is the unspent part of `maxCostUsd`.
   */
  checkBudget(): { ok: boolean; remaining: number } {
    const cost = this.getEstimatedCost(this.model)
    const totalTokens = this.tokenCounts.input + this.tokenCounts.output
    return {
      ok: cost < this.limits.maxCostUsd && totalTokens < this.limits.maxTokens,
      remaining: this.limits.maxCostUsd - cost
    }
  }
}
/**
 * Cost-aware routing: picks a model name from the task analysis and the budget.
 *
 * A simple task with less than 0.10 of budget remaining always gets the cheap
 * model; otherwise tasks that require reasoning get the stronger model and
 * everything else falls back to the cheap one.
 *
 * @param task   Analysis of the task (`complexity`, `requiresReasoning`).
 * @param budget Budget state (`remaining`).
 * @returns The model identifier to use.
 */
function selectModel(task: TaskAnalysis, budget: Budget): string {
  const cheapModel = 'claude-4-haiku' // Cheapest option, also the default
  const reasoningModel = 'claude-4-sonnet' // Best value for complex tasks

  // Budget is only consulted for simple tasks.
  if (task.complexity === 'simple' && budget.remaining < 0.10) {
    return cheapModel
  }
  return task.requiresReasoning ? reasoningModel : cheapModel
}
Observability: Logging Agent Decisions
/** One recorded step of an agent run. */
interface AgentTraceStep {
  stepNumber: number
  thought: string
  /** Tool invocation for this step; the input shape depends on the tool. */
  action: { tool: string; input: unknown }
  result: string
  tokensUsed: { input: number; output: number }
  durationMs: number
  timestamp: Date
}

/** Full record of a single agent run: identity, per-step log, outcome and totals. */
interface AgentTrace {
  traceId: string
  task: string
  startedAt: Date
  steps: AgentTraceStep[]
  outcome: 'success' | 'failure' | 'timeout' | 'cancelled'
  totalTokens: number
  totalCostUsd: number
  totalDurationMs: number
}
/**
 * Records an `AgentTrace` for one agent run.
 *
 * Relies on `this.agent.onStep`, `this.emit` and `this.persistTrace`, none of which
 * are defined in this snippet.
 */
class AgentObserver {
  /**
   * Runs `agentRun`, collecting every step the agent reports, then persists the trace.
   *
   * The trace is persisted whether the run resolves or rejects. A rejection is
   * recorded as outcome 'failure' and then re-thrown to the caller. If
   * `persistTrace` itself rejects, that rejection is what the caller sees.
   *
   * @param agentRun Callback that drives the agent to completion.
   * @param task     Optional task description stored on the trace (defaults to '').
   * @returns The completed trace when `agentRun` resolves.
   */
  async trace(agentRun: () => Promise<void>, task = ''): Promise<AgentTrace> {
    const trace: AgentTrace = {
      traceId: crypto.randomUUID(),
      task,
      startedAt: new Date(),
      steps: [],
      outcome: 'success',
      totalTokens: 0,
      // No pricing data is visible from this class, so cost is left at 0.
      totalCostUsd: 0,
      totalDurationMs: 0
    }

    // Instrument the agent. The listener is never removed (no unsubscribe API is
    // visible here), so this flag stops it appending to the trace after the run ends.
    let recording = true
    this.agent.onStep((step) => {
      if (!recording) return
      trace.steps.push(step)
      this.emit('step', { traceId: trace.traceId, step })
    })

    try {
      await agentRun()
    } catch (error: unknown) {
      trace.outcome = 'failure'
      throw error
    } finally {
      recording = false
      // Calculate totals and persist
      trace.totalDurationMs = Date.now() - trace.startedAt.getTime()
      trace.totalTokens = trace.steps.reduce(
        (sum, step) => sum + step.tokensUsed.input + step.tokensUsed.output,
        0
      )
      await this.persistTrace(trace)
    }

    return trace
  }
}
Building User Trust
/** Phases an agent run reports to the UI. */
type AgentPhase = 'planning' | 'executing' | 'reviewing' | 'complete'

/** Details of an action awaiting user approval. */
interface AgentApprovalRequest {
  action: string
  reason: string
}

/** Progress message that shows users what the agent is doing. */
interface AgentStatusUpdate {
  phase: AgentPhase
  currentStep: string
  progress: number // 0-100
  canCancel: boolean
  /** Present only when the update asks the user to approve an action. */
  requiresApproval?: AgentApprovalRequest
}
/**
 * Streams status updates to the UI while planning and executing a task.
 *
 * Yields one 'planning' update, then for each planned step an 'executing' update
 * before the step runs, and finally a 'complete' update. High-risk steps get an
 * extra update carrying `requiresApproval` first.
 *
 * NOTE: the approval wait is still a placeholder. After yielding the approval
 * request the step goes on to execute without checking any response.
 *
 * `agent` is declared outside this snippet.
 *
 * @param task Description of the work to perform.
 */
async function* runAgentWithUpdates(task: string): AsyncGenerator<AgentStatusUpdate> {
  // Every update carries `canCancel`, which AgentStatusUpdate requires. A consumer
  // can stop between yields by ending iteration, so it stays true until completion.
  yield { phase: 'planning', currentStep: 'Analyzing task...', progress: 0, canCancel: true }

  const plan = await agent.plan(task)

  for (let i = 0; i < plan.steps.length; i++) {
    const step = plan.steps[i]

    // High-risk actions require approval
    if (step.riskLevel === 'high') {
      yield {
        phase: 'executing',
        currentStep: step.description,
        progress: (i / plan.steps.length) * 100,
        canCancel: true,
        requiresApproval: {
          action: step.action,
          reason: 'This action will modify data'
        }
      }
      // Wait for approval...
    }

    // Reported at the midpoint of this step's share of the progress bar.
    yield {
      phase: 'executing',
      currentStep: step.description,
      progress: ((i + 0.5) / plan.steps.length) * 100,
      canCancel: true
    }

    await agent.executeStep(step)
  }

  yield { phase: 'complete', currentStep: 'Complete', progress: 100, canCancel: false }
}
Fallback Strategies
/**
 * Graceful degradation when agents fail.
 *
 * Tries the autonomous agent, then a single plain LLM call, and if both throw,
 * queues the task for a human together with both errors.
 *
 * `autonomousAgent`, `simpleLLMCall` and `queueForHuman` are declared outside
 * this snippet.
 *
 * @param task Description of the work to perform.
 * @returns The first successful result, or `{ status: 'queued_for_human' }`.
 */
async function handleTask(task: string): Promise<Result> {
  // First choice: the autonomous agent.
  let agentError: unknown
  try {
    return await autonomousAgent.execute(task)
  } catch (caught) {
    agentError = caught
  }

  // Second choice: a simpler one-shot LLM call.
  let llmError: unknown
  try {
    return await simpleLLMCall(task)
  } catch (caught) {
    llmError = caught
  }

  // Ultimate fallback: hand the task to a human along with both failures.
  await queueForHuman(task, { agentError, llmError })
  return { status: 'queued_for_human' }
}
Conclusion
Production agents require reliability engineering: timeouts, retries, cost tracking, observability, user trust mechanisms, and graceful degradation. The patterns in this guide come from real failures. Learn from them before deploying your own agents.
