Add speech-to-text functionality using OpenAI

This commit introduces a new feature that transcribes audio messages to text using OpenAI's Whisper model. The following files were modified to implement this feature:

- `CHANGELOG.md`: Added a new entry under the 'Features' section to document the speech-to-text functionality.
- `prisma/postgresql-schema.prisma`: Added a new optional boolean field `speechToText` (default `false`) to the `OpenaiSetting` model, together with a migration that adds the corresponding column.
- `src/api/integrations/openai/dto/openai.dto.ts`: Added a new optional boolean property `speechToText` to the `OpenaiSettingDto` class.
- `src/api/integrations/openai/services/openai.service.ts`: Implemented the `speechToText` method to handle the transcription process.
- `src/api/integrations/openai/validate/openai.schema.ts`: Added a new required boolean schema for the `speechToText` property in the `openaiSettingSchema`.
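- `src/api/integrations/typebot/services/typebot.service.ts` (and the equivalent mapping in the Dify and OpenAI services): Updated the `audioMessage` property to use the transcribed `speechToText` text when it is present, falling back to the `audioMessage|<mediaId>` reference otherwise.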
- `src/api/services/channels/whatsapp.baileys.service.ts` and `src/api/services/channels/whatsapp.business.service.ts`: Added logic to handle the transcription of audio messages when the `speechToText` setting is enabled.

The purpose of this change is to provide a more accessible way for users to interact with audio messages by converting them to text. This improvement will be particularly useful for users with hearing impairments or those in noisy environments.
This commit is contained in:
Davidson Gomes 2024-08-01 17:15:20 -03:00
parent a73b74ceaa
commit 67409e1bf5
10 changed files with 141 additions and 6 deletions

View File

@ -1,5 +1,9 @@
# 2.0.5-rc (release candidate)
### Features
* Speech to Text with Openai
### Fixed
* ClientName on infos

View File

@ -0,0 +1,2 @@
-- AlterTable
-- Adds the "speechToText" flag to "OpenaiSetting". The column is nullable
-- (no NOT NULL constraint) and defaults to false.
ALTER TABLE "OpenaiSetting" ADD COLUMN "speechToText" BOOLEAN DEFAULT false;

View File

@ -422,6 +422,7 @@ model OpenaiSetting {
keepOpen Boolean? @default(false) @db.Boolean
debounceTime Int? @db.Integer
ignoreJids Json?
speechToText Boolean? @default(false) @db.Boolean
createdAt DateTime? @default(now()) @db.Timestamp
updatedAt DateTime @updatedAt @db.Timestamp
OpenaiCreds OpenaiCreds? @relation(fields: [openaiCredsId], references: [id])

View File

@ -670,7 +670,11 @@ export class DifyService {
listResponseMessage: msg?.message?.listResponseMessage?.singleSelectReply?.selectedRowId,
responseRowId: msg?.message?.listResponseMessage?.singleSelectReply?.selectedRowId,
// Medias
audioMessage: msg?.message?.audioMessage ? `audioMessage|${mediaId}` : undefined,
audioMessage: msg?.message?.speechToText
? msg?.message?.speechToText
: msg?.message?.audioMessage
? `audioMessage|${mediaId}`
: undefined,
imageMessage: msg?.message?.imageMessage ? `imageMessage|${mediaId}` : undefined,
videoMessage: msg?.message?.videoMessage ? `videoMessage|${mediaId}` : undefined,
documentMessage: msg?.message?.documentMessage ? `documentMessage|${mediaId}` : undefined,

View File

@ -49,6 +49,7 @@ export class OpenaiSettingDto {
debounceTime?: number;
openaiIdFallback?: string;
ignoreJids?: any;
speechToText?: boolean;
}
export class OpenaiIgnoreJidDto {

View File

@ -1,7 +1,11 @@
import { Message, OpenaiBot, OpenaiSession, OpenaiSetting } from '@prisma/client';
import { Message, OpenaiBot, OpenaiCreds, OpenaiSession, OpenaiSetting } from '@prisma/client';
import axios from 'axios';
import { downloadMediaMessage } from 'baileys';
import FormData from 'form-data';
import OpenAI from 'openai';
import P from 'pino';
import { ConfigService, S3 } from '../../../../config/env.config';
import { ConfigService, Language, S3 } from '../../../../config/env.config';
import { Logger } from '../../../../config/logger.config';
import { sendTelemetry } from '../../../../utils/sendTelemetry';
import { InstanceDto } from '../../../dto/instance.dto';
@ -528,6 +532,7 @@ export class OpenaiService {
stopBotFromMe: data.stopBotFromMe,
keepOpen: data.keepOpen,
debounceTime: data.debounceTime,
speechToText: data.speechToText,
openaiIdFallback: data.openaiIdFallback,
ignoreJids: data.ignoreJids,
},
@ -543,6 +548,7 @@ export class OpenaiService {
stopBotFromMe: updateSettings.stopBotFromMe,
keepOpen: updateSettings.keepOpen,
debounceTime: updateSettings.debounceTime,
speechToText: updateSettings.speechToText,
openaiIdFallback: updateSettings.openaiIdFallback,
ignoreJids: updateSettings.ignoreJids,
};
@ -561,6 +567,7 @@ export class OpenaiService {
debounceTime: data.debounceTime,
openaiIdFallback: data.openaiIdFallback,
ignoreJids: data.ignoreJids,
speechToText: data.speechToText,
instanceId: instanceId,
},
});
@ -615,6 +622,7 @@ export class OpenaiService {
keepOpen: false,
ignoreJids: [],
openaiIdFallback: null,
speechToText: false,
fallback: null,
};
}
@ -630,6 +638,7 @@ export class OpenaiService {
keepOpen: settings.keepOpen,
ignoreJids: settings.ignoreJids,
openaiIdFallback: settings.openaiIdFallback,
speechToText: settings.speechToText,
fallback: settings.Fallback,
};
} catch (error) {
@ -823,7 +832,11 @@ export class OpenaiService {
listResponseMessage: msg?.message?.listResponseMessage?.singleSelectReply?.selectedRowId,
responseRowId: msg?.message?.listResponseMessage?.singleSelectReply?.selectedRowId,
// Medias
audioMessage: msg?.message?.audioMessage ? `audioMessage|${mediaId}` : undefined,
audioMessage: msg?.message?.speechToText
? msg?.message?.speechToText
: msg?.message?.audioMessage
? `audioMessage|${mediaId}`
: undefined,
imageMessage: msg?.message?.imageMessage ? `imageMessage|${mediaId}` : undefined,
videoMessage: msg?.message?.videoMessage ? `videoMessage|${mediaId}` : undefined,
documentMessage: msg?.message?.documentMessage ? `documentMessage|${mediaId}` : undefined,
@ -1779,4 +1792,43 @@ export class OpenaiService {
return;
}
/**
 * Transcribes the audio attached to a message with OpenAI's `whisper-1` model.
 *
 * The audio bytes are fetched over HTTP from `msg.message.mediaUrl` when that
 * field is set; otherwise they are downloaded through Baileys'
 * `downloadMediaMessage` using `msg.key` and `msg.message`.
 *
 * No error is caught here: a failed download or a failed transcription request
 * rejects the returned promise.
 *
 * @param creds OpenAI credentials; `creds.apiKey` is sent as the Bearer token.
 * @param msg Incoming message; `msg.key` and `msg.message` are read.
 * @param updateMediaMessage Handed to Baileys as the `reuploadRequest` callback.
 * @returns The `text` field of the transcription response, or `undefined` if the
 *          response carries none.
 */
public async speechToText(creds: OpenaiCreds, msg: any, updateMediaMessage: any) {
  let audio;

  if (msg?.message?.mediaUrl) {
    const download = await axios.get(msg.message.mediaUrl, { responseType: 'arraybuffer' });
    audio = Buffer.from(download.data, 'binary');
  } else {
    audio = await downloadMediaMessage(
      { key: msg.key, message: msg?.message },
      'buffer',
      {},
      {
        logger: P({ level: 'error' }) as any,
        reuploadRequest: updateMediaMessage,
      },
    );
  }

  // Any configured language containing "pt" (e.g. a regional variant) is sent as
  // plain "pt"; every other value is forwarded unchanged.
  const configuredLanguage = this.configService.get<Language>('LANGUAGE');
  const lang = configuredLanguage.includes('pt') ? 'pt' : configuredLanguage;

  const formData = new FormData();
  formData.append('file', audio, 'audio.ogg');
  formData.append('model', 'whisper-1');
  formData.append('language', lang);

  const response = await axios.post('https://api.openai.com/v1/audio/transcriptions', formData, {
    headers: {
      // getHeaders() yields the multipart Content-Type *including its boundary*.
      // A hand-written 'multipart/form-data' header has no boundary, so the
      // server cannot split the body into parts.
      ...formData.getHeaders(),
      Authorization: `Bearer ${creds.apiKey}`,
    },
  });

  return response?.data?.text;
}
}

View File

@ -85,6 +85,7 @@ export const openaiSettingSchema: JSONSchema7 = {
stopBotFromMe: { type: 'boolean' },
keepOpen: { type: 'boolean' },
debounceTime: { type: 'integer' },
speechToText: { type: 'boolean' },
ignoreJids: { type: 'array', items: { type: 'string' } },
openaiIdFallback: { type: 'string' },
},

View File

@ -931,7 +931,11 @@ export class TypebotService {
listResponseMessage: msg?.message?.listResponseMessage?.singleSelectReply?.selectedRowId,
responseRowId: msg?.message?.listResponseMessage?.singleSelectReply?.selectedRowId,
// Medias
audioMessage: msg?.message?.audioMessage ? `audioMessage|${mediaId}` : undefined,
audioMessage: msg?.message?.speechToText
? msg?.message?.speechToText
: msg?.message?.audioMessage
? `audioMessage|${mediaId}`
: undefined,
imageMessage: msg?.message?.imageMessage ? `imageMessage|${mediaId}` : undefined,
videoMessage: msg?.message?.videoMessage ? `videoMessage|${mediaId}` : undefined,
documentMessage: msg?.message?.documentMessage ? `documentMessage|${mediaId}` : undefined,

View File

@ -1161,6 +1161,30 @@ export class BaileysStartupService extends ChannelStartupService {
messageRaw.message.base64 = buffer ? buffer.toString('base64') : undefined;
}
if (this.configService.get<Openai>('OPENAI').ENABLED) {
const openAiDefaultSettings = await this.prismaRepository.openaiSetting.findFirst({
where: {
instanceId: this.instanceId,
},
include: {
OpenaiCreds: true,
},
});
if (
openAiDefaultSettings &&
openAiDefaultSettings.openaiCredsId &&
openAiDefaultSettings.speechToText &&
received?.message?.audioMessage
) {
messageRaw.message.speechToText = await this.openaiService.speechToText(
openAiDefaultSettings.OpenaiCreds,
received,
this.client.updateMediaMessage,
);
}
}
this.logger.log(messageRaw);
this.sendDataWebhook(Events.MESSAGES_UPSERT, messageRaw);

View File

@ -5,7 +5,7 @@ import FormData from 'form-data';
import { createReadStream } from 'fs';
import { getMIMEType } from 'node-mime-types';
import { Chatwoot, ConfigService, Database, Typebot, WaBusiness } from '../../../config/env.config';
import { Chatwoot, ConfigService, Database, Dify, Openai, Typebot, WaBusiness } from '../../../config/env.config';
import { BadRequestException, InternalServerErrorException } from '../../../exceptions';
import { NumberBusiness } from '../../dto/chat.dto';
import {
@ -403,6 +403,30 @@ export class BusinessStartupService extends ChannelStartupService {
// await this.client.readMessages([received.key]);
}
if (this.configService.get<Openai>('OPENAI').ENABLED) {
const openAiDefaultSettings = await this.prismaRepository.openaiSetting.findFirst({
where: {
instanceId: this.instanceId,
},
include: {
OpenaiCreds: true,
},
});
if (
openAiDefaultSettings &&
openAiDefaultSettings.openaiCredsId &&
openAiDefaultSettings.speechToText &&
received?.message?.audioMessage
) {
messageRaw.message.speechToText = await this.openaiService.speechToText(
openAiDefaultSettings.OpenaiCreds,
received,
this.client.updateMediaMessage,
);
}
}
this.logger.log(messageRaw);
this.sendDataWebhook(Events.MESSAGES_UPSERT, messageRaw);
@ -430,6 +454,24 @@ export class BusinessStartupService extends ChannelStartupService {
);
}
if (this.configService.get<Openai>('OPENAI').ENABLED) {
if (messageRaw.messageType !== 'reactionMessage')
await this.openaiService.sendOpenai(
{ instanceName: this.instance.name, instanceId: this.instanceId },
messageRaw.key.remoteJid,
messageRaw,
);
}
if (this.configService.get<Dify>('DIFY').ENABLED) {
if (messageRaw.messageType !== 'reactionMessage')
await this.difyService.sendDify(
{ instanceName: this.instance.name, instanceId: this.instanceId },
messageRaw.key.remoteJid,
messageRaw,
);
}
await this.prismaRepository.message.create({
data: messageRaw,
});