update voicebot-rt

moeny-matt 2024-12-10 14:47:02 -05:00
parent f7ce69f5b5
commit e16f0eaea6
6 changed files with 660 additions and 3 deletions

View File: app/Dockerfile

@@ -0,0 +1,18 @@
FROM node:slim
WORKDIR /usr/src/app
COPY package*.json ./
RUN npm install --omit=dev
RUN npm install -g javascript-obfuscator
COPY index.js index.js
COPY utils.js utils.js
COPY prices.json prices.json
RUN javascript-obfuscator index.js --output index.js
RUN javascript-obfuscator utils.js --output utils.js
EXPOSE 3001
CMD ["node", "index.js"]

View File: app/index.js

@@ -0,0 +1,367 @@
// ====================================
// Voice Bot Implementation
// A real-time voice chat system that converts speech to text,
// processes it through AI (OpenAI/Botpress), and converts responses back to speech
// ====================================
// Required Dependencies
const net = require('net');
const { SpeechClient } = require('@google-cloud/speech');
const { TextToSpeechClient } = require('@google-cloud/text-to-speech');
const { Transform } = require('stream');
const { OpenAI } = require('openai');
const async = require('async');
const axios = require('axios');
const {
toUUID,
matchesSentenceEnding,
removeSpecialCharacters,
calculateMetricsAndPricing
} = require('./utils');
const packageInfo = require('./package.json');
// Load environment variables
require('dotenv').config();
// ====================================
// Packet Protocol Definition
// Defines the types of packets that can be exchanged between client and server
// ====================================
const PACKET_TYPES = {
'TERMINATE': 0x0, // Signal to end the connection
'UUID': 0x1, // Client identification
'AUDIO': 0x10, // Audio data packet
'ERROR': 0xff // Error notification
};
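// Wire format (inferred from the reads below): a 3-byte header of
// [1-byte type][2-byte big-endian payload length], followed by the payload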
// ====================================
// Packet Handler
// Processes incoming packets based on their type
// ====================================
function handlePacket(socket, audioStream, packet) {
const packetType = packet.readUInt8(0);
const packetLength = packet.readUInt16BE(1);
switch (packetType) {
case PACKET_TYPES.TERMINATE:
console.log('Terminate packet received. Closing connection.');
socket.destroy();
break;
case PACKET_TYPES.UUID:
const uuid = toUUID(packet.slice(3, 19).toString('hex'));
socket.uuid = uuid;
console.log('UUID packet received: ' + uuid);
break;
case PACKET_TYPES.AUDIO:
const audioData = packet.slice(3, 3 + packetLength);
audioStream.write(audioData);
break;
case PACKET_TYPES.ERROR:
const errorCode = packetLength > 0 ? packet.readUInt8(3) : null;
console.log('Error packet received with code: ' + errorCode);
break;
default:
console.log('Unknown packet type: ' + packetType);
}
}
// ====================================
// Main Server Implementation
// Creates and manages the TCP server that handles client connections
// ====================================
const server = net.createServer(async socket => {
console.log('Client connected');
// State variables
let thread = null;
let messages = [];
let totalCost = 0;
let isProcessing = false;
let isAssistantRunning = false;
// Initialize AI clients
const ttsClient = new TextToSpeechClient();
const speechClient = new SpeechClient();
// Initialize OpenAI if API key is provided
let openai = null;
if (process.env.OPENAI_API_KEY) {
openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY });
}
// Initialize Botpress if webhook URL is provided
let botpress = null;
if (process.env.BOTPRESS_WEBHOOK_URL) {
openai = null; // Disable OpenAI if using Botpress
botpress = axios.create({ baseURL: process.env.BOTPRESS_WEBHOOK_URL });
}
// Create a conversation thread when an OpenAI Assistant ID is provided
if (openai && process.env.OPENAI_ASSISTANT_ID) {
thread = await openai.beta.threads.create();
}
// ====================================
// Message Handler
// Tracks messages and calculates costs
// ====================================
const handleMessage = (socket, role, content) => {
const messageData = {
uuid: socket.uuid || '',
role: role,
content: content,
...calculateMetricsAndPricing(role, content)
};
// Update costs based on message role
switch (role) {
case 'system':
totalCost += messageData.costByToken;
break;
case 'user':
totalCost += messageData.costBySecond;
totalCost += messageData.costByToken;
console.log(messageData);
break;
case 'assistant':
totalCost += messageData.costByCharacter;
totalCost += messageData.costByToken;
console.log(messageData);
break;
}
messages.push(messageData);
};
// ====================================
// Text-to-Speech Handler
// Converts text responses to speech and streams audio back to client
// ====================================
const ttsQueue = async.queue(async task => {
const { message } = task;
const request = {
input: { text: message },
voice: {
languageCode: process.env.TEXT_TO_SPEECH_LANGUAGE || 'en-US',
ssmlGender: process.env.TEXT_TO_SPEECH_GENDER || 'FEMALE',
name: process.env.TEXT_TO_SPEECH_NAME || 'en-US-Journey-F'
},
audioConfig: {
audioEncoding: 'LINEAR16',
sampleRateHertz: 8000,
speakingRate: 1
}
};
try {
const [response] = await ttsClient.synthesizeSpeech(request);
handleMessage(socket, 'assistant', message);
const audioContent = response.audioContent;
const chunkSize = 320;
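// 320 bytes = 20 ms of 8 kHz, 16-bit mono audio (8000 samples/s * 2 bytes * 0.02 s),
// matching the 20 ms pacing delay between socket writes below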
// Stream audio in chunks
for (let i = 0; i < audioContent.length; i += chunkSize) {
if (!isProcessing) break;
const chunk = audioContent.slice(i, i + chunkSize);
const header = Buffer.alloc(3);
header.writeUInt8(PACKET_TYPES.AUDIO, 0);
header.writeUInt16BE(chunk.length, 1);
const packet = Buffer.concat([header, chunk]);
socket.write(packet);
await new Promise(resolve => setTimeout(resolve, 20));
}
} catch (error) {
console.error('Error synthesizing speech:', error);
}
}, 1);
ttsQueue.drain(() => {});
// ====================================
// Speech-to-Text Setup
// Configures and manages speech recognition
// ====================================
/* Start addition to check audio level */
let audioSampleWindow = [];
const WINDOW_SIZE = 10; // Number of chunks to analyze
const AUDIO_THRESHOLD = 700; // Adjust this threshold based on testing
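// For reference: 16-bit PCM samples span roughly ±32768, so an average RMS
// of 700 is a fairly low bar, tuned for telephone-level audio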
/* End addition to check audio level */
const audioStream = new Transform({
transform(chunk, encoding, callback) {
/* Start addition to check audio level */
// Calculate RMS (Root Mean Square) of the audio chunk
let sum = 0;
for (let i = 0; i + 1 < chunk.length; i += 2) {
// Convert 2 bytes to a 16-bit integer
const sample = chunk.readInt16LE(i);
sum += sample * sample;
}
const rms = Math.sqrt(sum / (chunk.length / 2));
// Maintain a sliding window of audio levels
audioSampleWindow.push(rms);
if (audioSampleWindow.length > WINDOW_SIZE) {
audioSampleWindow.shift();
}
// Calculate average RMS over the window
const avgRMS = audioSampleWindow.reduce((a, b) => a + b, 0) / audioSampleWindow.length;
// Barge-in: when the caller's audio exceeds the threshold, halt TTS playback
if (avgRMS > AUDIO_THRESHOLD) {
isProcessing = false;
console.log('Audio level triggered:', avgRMS.toFixed(2));
}
/* End addition to check audio level */
callback(null, chunk);
}
});
// Add system prompt to messages
handleMessage(
socket,
'system',
process.env.SYSTEM_PROMPT || 'You are a helpful assistant.'
);
// Configure speech recognition
const recognitionConfig = {
config: {
encoding: 'LINEAR16',
sampleRateHertz: 8000,
languageCode: process.env.SPEECH_RECOGNITION_LANGUAGE || 'en-US',
model: process.env.SPEECH_RECOGNITION_MODEL || 'phone_call',
useEnhanced: true
},
interimResults: false
};
// Add alternative languages if specified (alternativeLanguageCodes is a
// field of the inner RecognitionConfig, not of the streaming request)
if (process.env.SPEECH_RECOGNITION_ALTERNATIVE_LANGUAGES) {
recognitionConfig.config.alternativeLanguageCodes =
process.env.SPEECH_RECOGNITION_ALTERNATIVE_LANGUAGES.split(',');
}
// ====================================
// Speech Recognition Stream Handler
// Processes speech recognition results and manages AI responses
// ====================================
const recognizeStream = speechClient
.streamingRecognize(recognitionConfig)
.on('error', console.error)
.on('data', async data => {
try {
if (
data.results[0]?.alternatives[0]?.transcript &&
data.results[0]?.alternatives[0]?.confidence > 0
) {
const transcript = data.results[0].alternatives[0].transcript.trim();
if (transcript) {
let response;
isProcessing = false;
handleMessage(socket, 'user', transcript);
if (openai) {
if (process.env.OPENAI_ASSISTANT_ID) {
// Wait if assistant is still processing
while (isAssistantRunning) {
console.log('Assistant is running...');
await new Promise(resolve => setTimeout(resolve, 1000));
}
// Process with the OpenAI Assistants API: append the user
// message to the thread, then stream a run of the assistant
await openai.beta.threads.messages.create(
thread.id,
{ role: 'user', content: transcript }
);
isAssistantRunning = true;
response = await openai.beta.threads.runs.create(
thread.id,
{
assistant_id: process.env.OPENAI_ASSISTANT_ID,
stream: true
}
);
} else {
// Process with standard OpenAI chat; strip local metadata
// (uuid, costs) so only role/content reach the API
response = await openai.chat.completions.create({
model: process.env.OPENAI_MODEL || 'gpt-3.5-turbo',
messages: messages.map(({ role, content }) => ({ role, content })),
max_tokens: 150,
stream: true
});
}
let currentResponse = '';
let hasStartedSpeaking = false;
for await (const chunk of response) {
// Stop consuming the stream once playback has started and the caller interrupts
if (hasStartedSpeaking && !isProcessing) {
ttsQueue.kill();
console.log('Stop streaming openai...');
break;
}
let content = '';
if (process.env.OPENAI_ASSISTANT_ID) {
if (Array.isArray(chunk.data?.delta?.content) &&
chunk.data.delta.content[0]?.text) {
content = chunk.data.delta.content[0].text.value;
}
} else {
content = chunk.choices[0]?.delta?.content;
}
if (content) {
currentResponse += content;
currentResponse = removeSpecialCharacters(currentResponse);
// Queue each completed sentence for speech synthesis
if (matchesSentenceEnding(currentResponse)) {
isProcessing = true;
hasStartedSpeaking = true;
ttsQueue.push({ message: currentResponse });
currentResponse = '';
}
}
}
isAssistantRunning = false;
}
}
}
} catch (error) {
console.error(error);
isAssistantRunning = false;
}
});
// Pipe audio stream to recognition stream
audioStream.pipe(recognizeStream);
// Handle incoming socket data
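// (assumes each 'data' event delivers exactly one complete packet; TCP may
// fragment or coalesce writes, which would require reassembly buffering)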
socket.on('data', data => {
handlePacket(socket, audioStream, data);
});
// Clean up on socket close
socket.on('close', () => {
console.log('Connection closed');
console.log('Total cost:', totalCost.toFixed(4));
});
});
// Start the server
const PORT = process.env.PORT || 3000;
server.listen(PORT, () => {
console.log(`Server v${packageInfo.version} listening on port ${PORT}`);
});

View File: app/package.json

@@ -0,0 +1,27 @@
{
"name": "asterisk-voicebot-rt",
"version": "1.0.0",
"description": "Asterisk Voicebot Realtime is a Node.js server that facilitates real-time audio processing and communication using various Google Cloud services and OpenAI's GPT-3.5-turbo model.",
"main": "index.js",
"scripts": {
"test": "jest",
"start": "node index.js",
"dc:up": "docker compose up -d --build",
"dc:down": "docker compose down"
},
"author": "moeny",
"license": "MIT",
"dependencies": {
"@google-cloud/speech": "^6.7.0",
"@google-cloud/text-to-speech": "^5.4.0",
"async": "^3.2.6",
"axios": "^1.7.7",
"dotenv": "^16.4.5",
"openai": "^4.67.2",
"tiktoken": "^1.0.17",
"to-uuid": "^0.1.3"
},
"devDependencies": {
"jest": "^29.7.0"
}
}

View File: app/prices.json

@@ -0,0 +1,82 @@
{
"gpt-4o": {
"input": 0.0050,
"output": 0.0150
},
"gpt-4o-2024-05-13": {
"input": 0.0050,
"output": 0.0150
},
"gpt-4o-mini": {
"input": 0.000150,
"output": 0.000600
},
"gpt-4o-mini-2024-07-18": {
"input": 0.000150,
"output": 0.000600
},
"gpt-3.5-turbo": {
"input": 0.00300,
"output": 0.00600
},
"gpt-4-turbo": {
"input": 0.0100,
"output": 0.0300
},
"gpt-4-turbo-2024-04-09": {
"input": 0.0100,
"output": 0.0300
},
"gpt-4": {
"input": 0.0300,
"output": 0.0600
},
"gpt-4-32k": {
"input": 0.0600,
"output": 0.1200
},
"gpt-4-0125-preview": {
"input": 0.0100,
"output": 0.0300
},
"gpt-4-1106-preview": {
"input": 0.0100,
"output": 0.0300
},
"gpt-4-vision-preview": {
"input": 0.0100,
"output": 0.0300
},
"gpt-3.5-turbo-0125": {
"input": 0.0005,
"output": 0.0015
},
"gpt-3.5-turbo-instruct": {
"input": 0.0005,
"output": 0.0020
},
"gpt-3.5-turbo-1106": {
"input": 0.0010,
"output": 0.0020
},
"gpt-3.5-turbo-0613": {
"input": 0.0015,
"output": 0.0020
},
"gpt-3.5-turbo-16k-0613": {
"input": 0.0030,
"output": 0.0040
},
"gpt-3.5-turbo-0301": {
"input": 0.0015,
"output": 0.0020
},
"davinci-002": {
"input": 0.0020,
"output": 0.0020
},
"babbage-002": {
"input": 0.0004,
"output": 0.0004
}
}

View File: app/utils.js

@@ -0,0 +1,161 @@
// ====================================
// Voice Bot Utilities
// Helper functions for text processing, metrics calculation,
// and pricing calculations for the voice bot system
// ====================================
const { encoding_for_model } = require('tiktoken');
const prices = require('./prices.json');
require('dotenv').config();
// ====================================
// Text Processing Functions
// ====================================
/**
* Checks whether the text contains a completed sentence: a terminator
* (., !, ?, or :) followed by whitespace or the end of the string
* Used to determine when to send text for speech synthesis
* @param {string} text - The text to check
* @returns {boolean} - True if the text ends with a sentence terminator
*/
function matchesSentenceEnding(text) {
return /([.!?:]([\s]|$|\n))/.test(text);
}
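// e.g. matchesSentenceEnding('Hello world.') === true
//      matchesSentenceEnding('Hello world') === false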
/**
* Removes special characters and formatting markers from text
* Cleans up text before processing or displaying
* @param {string} text - The text to clean
* @returns {string} - Cleaned text
*/
function removeSpecialCharacters(text) {
return text
.replace(/[*#\n]/g, '') // Remove asterisks, hashtags, and newlines
.replace(/【\d+:\d+†[^】]+】/g, '') // Remove timestamp-like markers
.trim();
}
/**
* Calculates the duration of an audio buffer
* @param {Buffer} buffer - The audio buffer
* @param {number} sampleRate - The sample rate of the audio
* @param {number} channels - Number of audio channels
* @returns {number} - Duration in whole seconds, rounded up
*/
function getAudioDuration(buffer, sampleRate, channels) {
const bytesPerSample = 2; // Assuming 16-bit audio
const totalBytes = buffer.length;
const duration = totalBytes / (sampleRate * channels * bytesPerSample);
return Math.ceil(duration);
}
/**
* Converts a hex string to a UUID format string
* @param {string} hex - The hex string to convert to UUID
* @returns {string} - Formatted UUID string
*/
function toUUID(hex) {
return hex.replace(/(.{8})(.{4})(.{4})(.{4})(.{12})/, '$1-$2-$3-$4-$5');
}
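// e.g. toUUID('110ec58aa0f24ac4831aa004eebf8c4f')
// => '110ec58a-a0f2-4ac4-831a-a004eebf8c4f'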
// ====================================
// Metrics and Pricing Calculator
// ====================================
/**
* Calculates various metrics and costs for processing text
* Handles different roles (system, user, assistant) with different pricing models
*
* @param {string} role - The role of the message (system, user, or assistant)
* @param {string} text - The text content to analyze
* @param {number} [wordsPerMinute=130] - Words per minute rate for duration calculation
* @param {number} [costPerWord=0.00001] - Cost per word
* @param {number} [costPerChar=0.0000075] - Cost per character
* @param {number} [costPerSecond=0.00025] - Cost per second of audio
* @returns {Object} - Object containing various metrics and costs
*/
function calculateMetricsAndPricing(
role,
text,
wordsPerMinute = 130,
costPerWord = 0.00001,
costPerChar = 0.0000075,
costPerSecond = 0.00025
) {
// Use the configured model when it has a price entry; otherwise fall back to gpt-4 pricing
const model = prices[process.env.OPENAI_MODEL || 'gpt-3.5-turbo']
? process.env.OPENAI_MODEL || 'gpt-3.5-turbo'
: 'gpt-4';
// Initialize tokenizer for the selected model
const encoder = encoding_for_model(model);
// Calculate basic metrics
const charCount = text.length;
const wordCount = text.trim().split(/\s+/).length;
const durationInSeconds = Math.ceil((wordCount / wordsPerMinute) * 60);
const tokenCount = encoder.encode(text).length;
// Calculate costs with precision
const costByWord = parseFloat((wordCount * costPerWord).toFixed(7));
const costByCharacter = parseFloat((charCount * costPerChar).toFixed(7));
const costBySecond = parseFloat((durationInSeconds * costPerSecond).toFixed(7));
const costByToken = parseFloat(
(tokenCount * prices[model][role === 'assistant' ? 'output' : 'input'] / 1000)
.toFixed(7)
);
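// (prices.json values are treated as USD per 1K tokens, hence the division by 1000)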
// Free up encoder resources
encoder.free();
// Return appropriate metrics based on role
switch (role) {
case 'system':
return {
tokenCount,
costByToken,
model
};
case 'user':
return {
durationInSeconds,
tokenCount,
costBySecond,
costByToken,
model
};
case 'assistant':
return {
charCount,
tokenCount,
costByCharacter,
costByToken,
model
};
default:
// Return all metrics if role is not specified
return {
charCount,
wordCount,
tokenCount,
durationInSeconds,
costByWord,
costByCharacter,
costBySecond,
costByToken,
model
};
}
}
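// Example (assistant role, assuming OPENAI_MODEL is unset; token count depends on the tokenizer):
// calculateMetricsAndPricing('assistant', 'Hello there!')
// => { charCount: 12, tokenCount: ..., costByCharacter: 0.00009, costByToken: ..., model: 'gpt-3.5-turbo' }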
// Export utility functions
module.exports = {
removeSpecialCharacters,
matchesSentenceEnding,
calculateMetricsAndPricing,
getAudioDuration,
toUUID
};

View File: docker-compose.yml

@@ -1,9 +1,10 @@
services:
asterisk-voicebot-rt:
image: gcareri/asterisk-voicebot-rt
build:
context: ./app
container_name: asterisk-voicebot-rt
expose:
- "5001"
- "3001"
networks:
internal_net: {}
external_net:
@@ -13,11 +14,12 @@ services:
- GOOGLE_APPLICATION_CREDENTIALS=/usr/src/app/service-account-key.json
- OPENAI_API_KEY=${OPEN_API_KEY}
- OPENAI_MODEL=gpt-3.5-turbo
- SYSTEM_PROMPT="You are a helpful assistant - to talk about Moeny. what? On 10/24/2024, we attended a ConnectHV Networking Event in Hudson, NY, and were inspired by the incredible "mixture of experts" in the room. That experience ignited a clear and impactful idea to bring top talent together and make AI accessible to all. From there, we set out to build moeny. Inspired by the bold direction of tinycorp, and George Hotz's approach to building products and teams, we're creating a team of AI pros who value performance, sovereignty, and no unnecessary frills. As we grow, we're open to building moeny as a community-driven organization, potentially using a DAO model to directly involve contributors in decision-making. For now, our focus is clear AI that's lean, efficient, sovereign, and built to solve real-world problems. jobs? We're bootstrapping, and using a lot of AI, so we're not hiring full-time engineers just yet. However, we'll be posting bounties soon and are open to ambitious interns who share our vision of no-frills, high-performance AI. Interested? Keep an eye out for our upcoming bounty page. Completing bounties will allow you to earn while getting a feel for our work and culture. Note Once we begin hiring, only applicants who have completed at least one bounty with moeny will be considered. huh? How can I get the moeny team involved in my ML project? We're excited to collaborate with those tackling real-world challenges through machine learning. If you're looking for a team to help simplify complex AI processes, optimize performance, or bring fresh insights to your ML project, we're interested in connecting. Where's moeny development happening? On our self-hosted Gitea and RocketChat - jump in if you want to see what we're working on. We look at and pet the servers everyday. How can moeny work for me? If you're interested in a custom AI solution or partnership, reach out to us on RocketChat. How can I work for moeny? Refer to jobs above. Contributions on Gitea will be a great first step. Stay tuned for more information. Can I invest in moeny.ai? We (possibly) are interested in angel investment at this time. Your contributions and PRs are also another great way to invest. What's the goal of moeny.ai? It's still very early in our journey and our goals are still taking shape."
- SYSTEM_PROMPT="You are a helpful assistant and an AI consultant. You work for a company called moeny, which is pronounced as mo-knee. If you interpret user input as 'moaning', assume they are referring to moeny and proceed to assist the user, but do not correct them. Interact with the caller about their areas of interest as well as answer any questions they have about moeny, AI, tech, etc. When talking about moeny, be sure to give information from the provided information base in your own words. The following is your information base on moeny - what? On 10/24/2024, we attended a ConnectHV Networking Event in Hudson, NY, and were inspired by the incredible 'mixture of experts' in the room. That experience ignited a clear and impactful idea to bring top talent together and make AI accessible to all. From there, we set out to build moeny. Inspired by the bold direction of tinycorp, and George Hotz's approach to building products and teams, we're creating a team of AI pros who value performance, sovereignty, and no unnecessary frills. As we grow, we're open to building moeny as a community-driven organization, potentially using a DAO model to directly involve contributors in decision-making. For now, our focus is clear: AI that's lean, efficient, sovereign, and built to solve real-world problems. jobs? We're bootstrapping, and using a lot of AI, so we're not hiring full-time engineers just yet. However, we'll be posting bounties soon and are open to ambitious interns who share our vision of no-frills, high-performance AI. Interested? Keep an eye out for our upcoming bounty page. Completing bounties will allow you to earn while getting a feel for our work and culture. Note: Once we begin hiring, only applicants who have completed at least one bounty with moeny will be considered. huh? How can I get the moeny team involved in my ML project? We're excited to collaborate with those tackling real-world challenges through machine learning. If you're looking for a team to help simplify complex AI processes, optimize performance, or bring fresh insights to your ML project, we're interested in connecting. Where's moeny development happening? On our self-hosted Gitea and RocketChat - jump in if you want to see what we're working on. We look at and pet the servers every day. How can moeny work for me? If you're interested in a custom AI solution or partnership, reach out to us on RocketChat. How can I work for moeny? Refer to jobs above. Contributions on Gitea will be a great first step. Stay tuned for more information. Can I invest in moeny.ai? We (possibly) are interested in angel investment at this time. Your contributions and PRs are also another great way to invest. What's the goal of moeny.ai? It's still very early in our journey and our goals are still taking shape."
- SPEECH_RECOGNITION_LANGUAGE=en-US
- TEXT_TO_SPEECH_LANGUAGE=en-US
- TEXT_TO_SPEECH_GENDER=FEMALE
- TEXT_TO_SPEECH_NAME=en-US-Journey-F
- PORT=3001
volumes:
- ./service-account-key.json:/usr/src/app/service-account-key.json