Audio Models
Infyr.AI provides comprehensive audio processing capabilities, including speech-to-text transcription and text-to-speech synthesis. The models support a wide range of audio formats and languages, making them suitable for diverse applications.
Available Models
Speech-to-Text
Whisper-1 (whisper-1)
Capabilities:
- Multi-language speech recognition
- Audio transcription with timestamps
- Audio translation to English
- Robust noise handling
- Multiple audio format support
Specifications:
- Supported Formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, webm
- Max File Size: 25MB
- Pricing: $6.60 per hour of audio processed
- Languages: 99+ languages supported
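Because the API rejects uploads over these limits, it can help to validate files client-side before sending them. A minimal sketch based on the limits listed above (the extension check is a simplification; container and codec can disagree):
import os

ALLOWED_FORMATS = {"flac", "mp3", "mp4", "mpeg", "mpga", "m4a", "ogg", "wav", "webm"}
MAX_FILE_SIZE = 25 * 1024 * 1024  # 25MB limit from the specifications above

def validate_audio_file(path):
    """Check extension and size against the documented Whisper limits."""
    ext = os.path.splitext(path)[1].lstrip(".").lower()
    if ext not in ALLOWED_FORMATS:
        raise ValueError(f"Unsupported format: {ext}")
    size = os.path.getsize(path)
    if size > MAX_FILE_SIZE:
        raise ValueError(f"File is {size / 1024 / 1024:.1f}MB; max is 25MB")
    return path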
Basic Transcription
from openai import OpenAI
client = OpenAI(
    base_url="https://api.infyr.ai/v1",
    api_key="YOUR_INFYR_API_KEY"
)
# Transcribe audio file
with open("audio.mp3", "rb") as audio_file:
    response = client.audio.transcriptions.create(
        model="whisper-1",
        file=audio_file,
        response_format="text"
    )

print("Transcription:", response)
import OpenAI from 'openai';
import fs from 'fs';
const openai = new OpenAI({
  apiKey: 'YOUR_INFYR_API_KEY',
  baseURL: 'https://api.infyr.ai/v1',
});
// Transcribe audio file
const audioFile = fs.createReadStream('audio.mp3');

const response = await openai.audio.transcriptions.create({
  model: 'whisper-1',
  file: audioFile,
  response_format: 'text'
});

console.log('Transcription:', response);
curl -X POST "https://api.infyr.ai/v1/audio/transcriptions" \
  -H "Authorization: Bearer YOUR_INFYR_API_KEY" \
  -F "model=whisper-1" \
  -F "file=@audio.mp3" \
  -F "response_format=text"
Advanced Transcription with Options
# Transcription with additional parameters
with open("interview.wav", "rb") as audio_file:
    response = client.audio.transcriptions.create(
        model="whisper-1",
        file=audio_file,
        response_format="verbose_json",  # Detailed response with language and duration
        language="en",  # Specify language for better accuracy
        prompt="This is an interview about artificial intelligence and machine learning.",  # Context prompt
        temperature=0.2  # Lower temperature for more consistent results
    )

print("Transcription:", response.text)
print("Language:", response.language)
print("Duration:", response.duration)
Transcription with Timestamps
# Get transcription with word-level timestamps
with open("meeting.mp3", "rb") as audio_file:
    response = client.audio.transcriptions.create(
        model="whisper-1",
        file=audio_file,
        response_format="verbose_json",
        timestamp_granularities=["word"]
    )

print("Full transcription:", response.text)

# Print word-level timestamps
for word in response.words:
    print(f"{word.word} ({word.start:.2f}s - {word.end:.2f}s)")
Audio Translation
# Translate non-English audio to English
with open("spanish_audio.mp3", "rb") as audio_file:
    response = client.audio.translations.create(
        model="whisper-1",
        file=audio_file,
        response_format="json"
    )

print("English translation:", response.text)
Text-to-Speech
PlayAI TTS v3 (playai/tts/v3)
Capabilities:
- High-quality speech synthesis
- Multiple voice options
- Natural-sounding audio generation
- Customizable voice characteristics
- Multiple output formats
Specifications:
- Pricing: $0.0006 per second of generated audio
- Output Formats: MP3, WAV, OGG
- Voice Options: Multiple preset voices available
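At $0.0006 per second, generation cost is easy to estimate up front. A rough sketch; the 150-words-per-minute speaking rate is an assumption, and actual duration varies by voice and text:
WORDS_PER_MINUTE = 150       # Assumed average speaking rate; varies by voice
PRICE_PER_SECOND = 0.0006    # From the pricing above

def estimate_tts_cost(text):
    """Rough cost estimate for synthesizing the given text."""
    words = len(text.split())
    est_seconds = words / WORDS_PER_MINUTE * 60
    return est_seconds * PRICE_PER_SECOND

# A 1,000-word script is roughly 400 seconds, or about $0.24
print(f"${estimate_tts_cost('word ' * 1000):.2f}")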
Basic Text-to-Speech
import requests
import time

# Generate speech from text
url = "https://api.infyr.ai/v1/audio/generations"
headers = {
    "Authorization": "Bearer YOUR_API_KEY",
    "Content-Type": "application/json"
}
data = {
    "model": "playai/tts/v3",
    "input": "Hello, welcome to Infyr.AI! We provide cutting-edge AI models for your applications.",
    "voice": "Jennifer (English (US)/American)"
}

response = requests.post(url, headers=headers, json=data)
response_data = response.json()

print("Audio generation request ID:", response_data["request_id"])
print("Status:", response_data["status"])

# Check status and get result
request_id = response_data["request_id"]
while True:
    status_url = f"https://api.infyr.ai/v1/audio/generations/{request_id}/status"
    status_response = requests.get(status_url, headers=headers)
    status_data = status_response.json()

    if status_data["status"] == "completed":
        # Get the audio result
        result_url = f"https://api.infyr.ai/v1/audio/generations/{request_id}/result"
        result_response = requests.get(result_url, headers=headers)
        result_data = result_response.json()
        audio_url = result_data["audio"]["url"]
        print("Audio ready:", audio_url)
        break
    elif status_data["status"] == "failed":
        print("Generation failed")
        break

    time.sleep(2)  # Wait 2 seconds before checking again
HTTP API with cURL
# Generate speech from text
curl -X POST "https://api.infyr.ai/v1/audio/generations" \
  -H "Authorization: Bearer YOUR_API_KEY" \
  -H "Content-Type: application/json" \
  -d '{
    "model": "playai/tts/v3",
    "input": "Hello, welcome to Infyr.AI! We provide cutting-edge AI models for your applications.",
    "voice": "Jennifer (English (US)/American)"
  }'

# Response will include request_id; use it to check status
curl -X GET "https://api.infyr.ai/v1/audio/generations/{request_id}/status" \
  -H "Authorization: Bearer YOUR_API_KEY"

# When status is "completed", get the result
curl -X GET "https://api.infyr.ai/v1/audio/generations/{request_id}/result" \
  -H "Authorization: Bearer YOUR_API_KEY"
Advanced TTS with Custom Settings
// Generate speech with custom parameters
const generateAudio = async () => {
  const response = await fetch('https://api.infyr.ai/v1/audio/generations', {
    method: 'POST',
    headers: {
      'Authorization': 'Bearer YOUR_INFYR_API_KEY',
      'Content-Type': 'application/json'
    },
    body: JSON.stringify({
      model: 'playai/tts/v3',
      input: 'This is a sample text for high-quality speech synthesis with custom voice settings.',
      voice: 'Jennifer (English (US)/American)',
      response_format: 'url',
      seed: 42 // For reproducible results
    })
  });

  const responseData = await response.json();
  console.log('Generation started:', responseData.request_id);
  return responseData.request_id;
};

// Poll for completion
async function waitForCompletion(requestId) {
  while (true) {
    const statusResponse = await fetch(
      `https://api.infyr.ai/v1/audio/generations/${requestId}/status`,
      {
        headers: {
          'Authorization': 'Bearer YOUR_INFYR_API_KEY'
        }
      }
    );
    const status = await statusResponse.json();

    if (status.status === 'completed') {
      const resultResponse = await fetch(
        `https://api.infyr.ai/v1/audio/generations/${requestId}/result`,
        {
          headers: {
            'Authorization': 'Bearer YOUR_INFYR_API_KEY'
          }
        }
      );
      const result = await resultResponse.json();
      return result.audio.url;
    } else if (status.status === 'failed') {
      throw new Error('Audio generation failed');
    }

    await new Promise(resolve => setTimeout(resolve, 1000));
  }
}

// Usage
const requestId = await generateAudio();
const audioUrl = await waitForCompletion(requestId);
console.log('Audio available at:', audioUrl);
Use Case Examples
1. Meeting Transcription Service
from openai import OpenAI

class MeetingTranscriber:
    def __init__(self, api_key):
        self.client = OpenAI(
            base_url="https://api.infyr.ai/v1",
            api_key=api_key
        )

    def transcribe_meeting(self, audio_file_path, meeting_context=""):
        """Transcribe a meeting with speaker identification context"""
        with open(audio_file_path, "rb") as audio_file:
            response = self.client.audio.transcriptions.create(
                model="whisper-1",
                file=audio_file,
                response_format="verbose_json",
                prompt=f"This is a business meeting. {meeting_context}",
                temperature=0.1
            )

        return {
            "transcript": response.text,
            "language": response.language,
            "duration": response.duration,
            "segments": response.segments if hasattr(response, 'segments') else None
        }

    def generate_summary(self, transcript):
        """Generate a summary using a text model"""
        summary_response = self.client.chat.completions.create(
            model="lumo-8b",
            messages=[
                {
                    "role": "system",
                    "content": "You are a professional meeting summarizer. Create a concise summary with key points and action items."
                },
                {
                    "role": "user",
                    "content": f"Please summarize this meeting transcript:\n\n{transcript}"
                }
            ],
            max_tokens=500
        )
        return summary_response.choices[0].message.content

# Usage
transcriber = MeetingTranscriber("YOUR_API_KEY")
result = transcriber.transcribe_meeting(
    "team_meeting.mp3",
    "Participants discuss quarterly goals and project updates."
)
print("Transcript:", result["transcript"])

summary = transcriber.generate_summary(result["transcript"])
print("Summary:", summary)
2. Multilingual Content Creation
import requests

def create_multilingual_audio_content(text, languages, api_key):
    """Create audio content in multiple languages"""
    # First, translate the text to different languages
    translated_content = {}

    for lang_code, lang_name in languages.items():
        # Translate using a text model (HTTP API, for consistency with the TTS calls)
        chat_url = "https://api.infyr.ai/v1/chat/completions"
        chat_headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json"
        }
        chat_data = {
            "model": "lumo-8b",
            "messages": [
                {
                    "role": "system",
                    "content": f"You are a professional translator. Translate the following text to {lang_name}. Only return the translation."
                },
                {
                    "role": "user",
                    "content": text
                }
            ],
            "max_tokens": 200
        }

        translation_response = requests.post(chat_url, headers=chat_headers, json=chat_data)
        translation_data = translation_response.json()
        translated_text = translation_data["choices"][0]["message"]["content"]
        translated_content[lang_code] = translated_text

        # Generate audio for each translation over HTTP
        tts_url = "https://api.infyr.ai/v1/audio/generations"
        tts_headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json"
        }
        tts_data = {
            "model": "playai/tts/v3",
            "input": translated_text,
            "voice": get_voice_for_language(lang_code)
        }

        tts_response = requests.post(tts_url, headers=tts_headers, json=tts_data)
        tts_response_data = tts_response.json()
        print(f"Generated audio for {lang_name}: {tts_response_data['request_id']}")

    return translated_content

def get_voice_for_language(lang_code):
    """Map language codes to appropriate voices"""
    voice_mapping = {
        'en': 'Jennifer (English (US)/American)',
        'es': 'Maria (Spanish/Spain)',
        'fr': 'Claire (French/France)',
        'de': 'Hans (German/Germany)'
    }
    return voice_mapping.get(lang_code, 'Jennifer (English (US)/American)')

# Create content in multiple languages
languages = {
    'en': 'English',
    'es': 'Spanish',
    'fr': 'French'
}

content = create_multilingual_audio_content(
    "Welcome to our AI-powered platform. Experience the future of technology today.",
    languages,
    "YOUR_API_KEY"
)
3. Podcast Processing Pipeline
import json
from datetime import datetime
from openai import OpenAI

class PodcastProcessor:
    def __init__(self, api_key):
        self.client = OpenAI(
            base_url="https://api.infyr.ai/v1",
            api_key=api_key
        )

    def process_podcast_episode(self, audio_file_path, episode_info):
        """Complete podcast processing pipeline"""
        # Step 1: Transcribe the episode
        print("Transcribing podcast episode...")
        with open(audio_file_path, "rb") as audio_file:
            transcript_response = self.client.audio.transcriptions.create(
                model="whisper-1",
                file=audio_file,
                response_format="verbose_json",
                prompt="This is a podcast episode with multiple speakers discussing various topics."
            )

        # Step 2: Generate episode summary
        print("Generating episode summary...")
        summary_response = self.client.chat.completions.create(
            model="deepseek-70b",
            messages=[
                {
                    "role": "system",
                    "content": "You are a podcast content analyzer. Create an engaging summary, extract key topics, and identify memorable quotes."
                },
                {
                    "role": "user",
                    "content": f"Analyze this podcast transcript and provide:\n1. Episode summary\n2. Key topics discussed\n3. Notable quotes\n4. Main takeaways\n\nTranscript:\n{transcript_response.text}"
                }
            ],
            max_tokens=1000
        )

        # Step 3: Generate show notes
        print("Creating show notes...")
        show_notes_response = self.client.chat.completions.create(
            model="lumo-8b",
            messages=[
                {
                    "role": "system",
                    "content": "You are a professional podcast editor. Create detailed show notes with timestamps and topics."
                },
                {
                    "role": "user",
                    "content": f"Create structured show notes for this podcast:\n\n{transcript_response.text}"
                }
            ],
            max_tokens=800
        )

        # Step 4: Create audio highlights (optional)
        # This would involve identifying key segments and generating shorter audio clips

        return {
            "episode_info": episode_info,
            "transcript": transcript_response.text,
            "duration": transcript_response.duration,
            "language": transcript_response.language,
            "summary": summary_response.choices[0].message.content,
            "show_notes": show_notes_response.choices[0].message.content,
            "processed_at": datetime.now().isoformat()
        }

    def save_results(self, results, output_file):
        """Save processing results to a JSON file"""
        with open(output_file, 'w') as f:
            json.dump(results, f, indent=2)

# Usage
processor = PodcastProcessor("YOUR_API_KEY")

episode_info = {
    "title": "AI in Healthcare",
    "host": "Dr. Sarah Johnson",
    "guest": "Dr. Michael Chen",
    "episode_number": 42
}

results = processor.process_podcast_episode(
    "podcast_episode_42.mp3",
    episode_info
)
processor.save_results(results, "episode_42_processed.json")
4. Voice-Interactive Application
import asyncio
import aiohttp

class VoiceAssistant:
    def __init__(self, api_key):
        self.api_key = api_key
        self.base_url = "https://api.infyr.ai/v1"
        self.conversation_history = []

    async def process_voice_input(self, audio_file_path):
        """Process voice input and generate a voice response"""
        async with aiohttp.ClientSession() as session:
            # Step 1: Transcribe user input
            headers = {"Authorization": f"Bearer {self.api_key}"}

            with open(audio_file_path, "rb") as audio_file:
                form_data = aiohttp.FormData()
                form_data.add_field('file', audio_file, filename='audio.mp3')
                form_data.add_field('model', 'whisper-1')
                # Use JSON so the transcript can be read from the "text" field
                form_data.add_field('response_format', 'json')

                async with session.post(
                    f"{self.base_url}/audio/transcriptions",
                    headers=headers,
                    data=form_data
                ) as response:
                    transcript_data = await response.json()
                    user_message = transcript_data["text"].strip()

            self.conversation_history.append({"role": "user", "content": user_message})

            # Step 2: Generate response using a chat model
            chat_data = {
                "model": "lumo-8b",
                "messages": [
                    {"role": "system", "content": "You are a helpful voice assistant. Keep responses conversational and concise."},
                    *self.conversation_history[-10:]  # Keep last 10 messages for context
                ],
                "max_tokens": 200,
                "temperature": 0.7
            }

            async with session.post(
                f"{self.base_url}/chat/completions",
                headers={**headers, "Content-Type": "application/json"},
                json=chat_data
            ) as response:
                chat_response = await response.json()
                assistant_message = chat_response["choices"][0]["message"]["content"]

            self.conversation_history.append({"role": "assistant", "content": assistant_message})

            # Step 3: Generate voice response
            tts_data = {
                "model": "playai/tts/v3",
                "input": assistant_message,
                "voice": "Jennifer (English (US)/American)"
            }

            async with session.post(
                f"{self.base_url}/audio/generations",
                headers={**headers, "Content-Type": "application/json"},
                json=tts_data
            ) as response:
                tts_response = await response.json()

            return {
                "user_input": user_message,
                "assistant_response": assistant_message,
                "audio_request_id": tts_response["request_id"]
            }

    async def get_response_audio(self, request_id):
        """Get the generated audio response"""
        headers = {"Authorization": f"Bearer {self.api_key}"}

        async with aiohttp.ClientSession() as session:
            while True:
                async with session.get(
                    f"{self.base_url}/audio/generations/{request_id}/status",
                    headers=headers
                ) as response:
                    status = await response.json()

                if status["status"] == "completed":
                    async with session.get(
                        f"{self.base_url}/audio/generations/{request_id}/result",
                        headers=headers
                    ) as response:
                        result = await response.json()
                        return result["audio"]["url"]
                elif status["status"] == "failed":
                    raise Exception("Audio generation failed")

                await asyncio.sleep(1)

# Usage
assistant = VoiceAssistant("YOUR_API_KEY")

async def voice_conversation():
    result = await assistant.process_voice_input("user_question.wav")
    print("User said:", result["user_input"])
    print("Assistant responds:", result["assistant_response"])

    audio_url = await assistant.get_response_audio(result["audio_request_id"])
    print("Response audio:", audio_url)

# Run the conversation
asyncio.run(voice_conversation())
Best Practices
Audio File Optimization
def optimize_audio_for_processing(input_file, output_file):
    """Prepare audio files for optimal processing"""
    # Example using pydub (install with: pip install pydub)
    from pydub import AudioSegment

    # Load audio file
    audio = AudioSegment.from_file(input_file)

    # Optimize for Whisper
    audio = audio.set_frame_rate(16000)  # Whisper prefers 16kHz
    audio = audio.set_channels(1)        # Mono audio
    audio = audio.normalize()            # Normalize volume

    # Strip long stretches of silence to reduce duration (and cost)
    audio = audio.strip_silence()

    # Export optimized version
    audio.export(output_file, format="wav")
    return output_file
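Usage is a single call (the filenames here are placeholders):
# Convert a raw recording into a Whisper-friendly WAV file
optimized_path = optimize_audio_for_processing("raw_recording.m4a", "optimized.wav")
print("Ready for transcription:", optimized_path)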
Error Handling and Retry Logic
import time
from openai import OpenAI, APIError, RateLimitError

client = OpenAI(
    base_url="https://api.infyr.ai/v1",
    api_key="YOUR_INFYR_API_KEY"
)

def robust_audio_processing(file_path, operation="transcribe", max_retries=3):
    """Process audio with robust error handling"""
    for attempt in range(max_retries):
        try:
            with open(file_path, "rb") as audio_file:
                if operation == "transcribe":
                    response = client.audio.transcriptions.create(
                        model="whisper-1",
                        file=audio_file,
                        response_format="json"
                    )
                elif operation == "translate":
                    response = client.audio.translations.create(
                        model="whisper-1",
                        file=audio_file,
                        response_format="json"
                    )
            return response
        except RateLimitError:
            wait_time = (2 ** attempt) * 60  # Exponential backoff: 1, 2, 4... minutes (in seconds)
            print(f"Rate limit hit. Waiting {wait_time / 60:.1f} minutes...")
            time.sleep(wait_time)
        except APIError as e:
            print(f"API Error: {e}")
            if attempt == max_retries - 1:
                raise
            time.sleep(30)  # Wait 30 seconds before retry
        except Exception as e:
            print(f"Unexpected error: {e}")
            if attempt == max_retries - 1:
                raise
            time.sleep(10)

    raise Exception("Max retries exceeded")

# Usage
result = robust_audio_processing("audio_file.mp3", "transcribe")
Cost Optimization
def calculate_audio_processing_cost(file_path):
    """Calculate estimated cost for audio processing"""
    from pydub import AudioSegment

    # Get audio duration
    audio = AudioSegment.from_file(file_path)
    duration_seconds = len(audio) / 1000.0
    duration_hours = duration_seconds / 3600.0

    # Whisper pricing: $6.60 per hour
    transcription_cost = duration_hours * 6.6

    print(f"Audio duration: {duration_seconds:.1f} seconds ({duration_hours:.3f} hours)")
    print(f"Estimated transcription cost: ${transcription_cost:.4f}")
    return transcription_cost

# Check cost before processing
cost = calculate_audio_processing_cost("long_meeting.mp3")
if cost > 1.0:  # Only process if the estimated cost is under $1
    print("Cost too high, consider splitting the file")
else:
    # Process the file
    pass
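When the estimate is too high, or a file exceeds the 25MB upload limit, one option is to split the audio into chunks and transcribe each separately. A sketch using pydub; the 10-minute chunk length is an arbitrary choice, and naive fixed-length cuts can split a word at a boundary:
from pydub import AudioSegment

def split_audio(file_path, chunk_minutes=10):
    """Split an audio file into fixed-length chunks for separate transcription."""
    audio = AudioSegment.from_file(file_path)
    chunk_ms = chunk_minutes * 60 * 1000
    chunk_paths = []

    for i, start in enumerate(range(0, len(audio), chunk_ms)):
        chunk = audio[start:start + chunk_ms]
        path = f"chunk_{i:03d}.mp3"
        chunk.export(path, format="mp3")
        chunk_paths.append(path)

    return chunk_paths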
Integration Examples
Flask Web Application
from flask import Flask, request, jsonify
import tempfile
import os
import requests

app = Flask(__name__)

API_KEY = "YOUR_INFYR_API_KEY"
BASE_URL = "https://api.infyr.ai/v1"

@app.route('/transcribe', methods=['POST'])
def transcribe_audio():
    """API endpoint for audio transcription"""
    if 'audio' not in request.files:
        return jsonify({"error": "No audio file provided"}), 400

    audio_file = request.files['audio']

    # Save uploaded file temporarily
    with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as temp_file:
        audio_file.save(temp_file.name)

    try:
        # Transcribe audio using the HTTP API
        headers = {"Authorization": f"Bearer {API_KEY}"}
        with open(temp_file.name, 'rb') as f:
            files = {"file": f}
            data = {
                "model": "whisper-1",
                "response_format": "json"
            }
            response = requests.post(
                f"{BASE_URL}/audio/transcriptions",
                headers=headers,
                files=files,
                data=data
            )
        response_data = response.json()

        result = {
            "transcript": response_data.get("text", ""),
            "language": response_data.get("language", ""),
            "duration": response_data.get("duration", None)
        }
        return jsonify(result)
    except Exception as e:
        return jsonify({"error": str(e)}), 500
    finally:
        # Clean up temp file
        os.unlink(temp_file.name)

@app.route('/synthesize', methods=['POST'])
def synthesize_speech():
    """API endpoint for text-to-speech"""
    data = request.get_json()
    text = data.get('text', '')
    voice = data.get('voice', 'Jennifer (English (US)/American)')

    try:
        headers = {
            "Authorization": f"Bearer {API_KEY}",
            "Content-Type": "application/json"
        }
        tts_data = {
            "model": "playai/tts/v3",
            "input": text,
            "voice": voice
        }

        response = requests.post(
            f"{BASE_URL}/audio/generations",
            headers=headers,
            json=tts_data
        )
        response_data = response.json()

        return jsonify({
            "request_id": response_data.get("request_id"),
            "status": response_data.get("status")
        })
    except Exception as e:
        return jsonify({"error": str(e)}), 500

if __name__ == '__main__':
    app.run(debug=True)