From 67409e1bf52d6fd46e70b58bc7e80f8323e16de6 Mon Sep 17 00:00:00 2001 From: Davidson Gomes Date: Thu, 1 Aug 2024 17:15:20 -0300 Subject: [PATCH] Add speech-to-text functionality using OpenAI This commit introduces a new feature that transcribes audio messages to text using OpenAI's Whisper model. The following files were modified to implement this feature: - `CHANGELOG.md`: Added a new entry under the 'Features' section to document the speech-to-text functionality. - `prisma/migrations/20240801193907_add_column_speech_to_text_openai_setting_table/migration.sql`: Added a migration that creates the `speechToText` column on the `OpenaiSetting` table. - `prisma/postgresql-schema.prisma`: Added a new boolean field `speechToText` to the `OpenaiSetting` model. - `src/api/integrations/dify/services/dify.service.ts`: Updated the `audioMessage` property to use the transcription when available. - `src/api/integrations/openai/dto/openai.dto.ts`: Added a new optional boolean property `speechToText` to the `OpenaiSettingDto` class. - `src/api/integrations/openai/services/openai.service.ts`: Implemented the `speechToText` method to handle the transcription process. - `src/api/integrations/openai/validate/openai.schema.ts`: Added a new boolean schema for the `speechToText` property in the `openaiSettingSchema`. - `src/api/integrations/typebot/services/typebot.service.ts`: Updated the `audioMessage` property to consider the new `speechToText` field. - `src/api/services/channels/whatsapp.baileys.service.ts` and `src/api/services/channels/whatsapp.business.service.ts`: Added logic to transcribe audio messages when the `speechToText` setting is enabled; the business channel additionally now triggers the OpenAI and Dify bots for non-reaction messages. The purpose of this change is to provide a more accessible way for users to interact with audio messages by converting them to text. This improvement will be particularly useful for users with hearing impairments or those in noisy environments. 
--- CHANGELOG.md | 4 ++ .../migration.sql | 2 + prisma/postgresql-schema.prisma | 1 + .../dify/services/dify.service.ts | 6 +- src/api/integrations/openai/dto/openai.dto.ts | 1 + .../openai/services/openai.service.ts | 58 ++++++++++++++++++- .../openai/validate/openai.schema.ts | 1 + .../typebot/services/typebot.service.ts | 6 +- .../channels/whatsapp.baileys.service.ts | 24 ++++++++ .../channels/whatsapp.business.service.ts | 44 +++++++++++++- 10 files changed, 141 insertions(+), 6 deletions(-) create mode 100644 prisma/migrations/20240801193907_add_column_speech_to_text_openai_setting_table/migration.sql diff --git a/CHANGELOG.md b/CHANGELOG.md index 1cf3b9c9..c07736f0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,9 @@ # 2.0.5-rc (release candidate) +### Features + +* Speech to Text with Openai + ### Fixed * ClientName on infos diff --git a/prisma/migrations/20240801193907_add_column_speech_to_text_openai_setting_table/migration.sql b/prisma/migrations/20240801193907_add_column_speech_to_text_openai_setting_table/migration.sql new file mode 100644 index 00000000..1f8e1fe5 --- /dev/null +++ b/prisma/migrations/20240801193907_add_column_speech_to_text_openai_setting_table/migration.sql @@ -0,0 +1,2 @@ +-- AlterTable +ALTER TABLE "OpenaiSetting" ADD COLUMN "speechToText" BOOLEAN DEFAULT false; diff --git a/prisma/postgresql-schema.prisma b/prisma/postgresql-schema.prisma index d077530a..686c560f 100644 --- a/prisma/postgresql-schema.prisma +++ b/prisma/postgresql-schema.prisma @@ -422,6 +422,7 @@ model OpenaiSetting { keepOpen Boolean? @default(false) @db.Boolean debounceTime Int? @db.Integer ignoreJids Json? + speechToText Boolean? @default(false) @db.Boolean createdAt DateTime? @default(now()) @db.Timestamp updatedAt DateTime @updatedAt @db.Timestamp OpenaiCreds OpenaiCreds? 
@relation(fields: [openaiCredsId], references: [id]) diff --git a/src/api/integrations/dify/services/dify.service.ts b/src/api/integrations/dify/services/dify.service.ts index 2c3a442d..b2131c92 100644 --- a/src/api/integrations/dify/services/dify.service.ts +++ b/src/api/integrations/dify/services/dify.service.ts @@ -670,7 +670,11 @@ export class DifyService { listResponseMessage: msg?.message?.listResponseMessage?.singleSelectReply?.selectedRowId, responseRowId: msg?.message?.listResponseMessage?.singleSelectReply?.selectedRowId, // Medias - audioMessage: msg?.message?.audioMessage ? `audioMessage|${mediaId}` : undefined, + audioMessage: msg?.message?.speechToText + ? msg?.message?.speechToText + : msg?.message?.audioMessage + ? `audioMessage|${mediaId}` + : undefined, imageMessage: msg?.message?.imageMessage ? `imageMessage|${mediaId}` : undefined, videoMessage: msg?.message?.videoMessage ? `videoMessage|${mediaId}` : undefined, documentMessage: msg?.message?.documentMessage ? `documentMessage|${mediaId}` : undefined, diff --git a/src/api/integrations/openai/dto/openai.dto.ts b/src/api/integrations/openai/dto/openai.dto.ts index c4b7ebef..89b14230 100644 --- a/src/api/integrations/openai/dto/openai.dto.ts +++ b/src/api/integrations/openai/dto/openai.dto.ts @@ -49,6 +49,7 @@ export class OpenaiSettingDto { debounceTime?: number; openaiIdFallback?: string; ignoreJids?: any; + speechToText?: boolean; } export class OpenaiIgnoreJidDto { diff --git a/src/api/integrations/openai/services/openai.service.ts b/src/api/integrations/openai/services/openai.service.ts index 0f2e24fe..523a9784 100644 --- a/src/api/integrations/openai/services/openai.service.ts +++ b/src/api/integrations/openai/services/openai.service.ts @@ -1,7 +1,11 @@ -import { Message, OpenaiBot, OpenaiSession, OpenaiSetting } from '@prisma/client'; +import { Message, OpenaiBot, OpenaiCreds, OpenaiSession, OpenaiSetting } from '@prisma/client'; +import axios from 'axios'; +import { downloadMediaMessage } 
from 'baileys'; +import FormData from 'form-data'; import OpenAI from 'openai'; +import P from 'pino'; -import { ConfigService, S3 } from '../../../../config/env.config'; +import { ConfigService, Language, S3 } from '../../../../config/env.config'; import { Logger } from '../../../../config/logger.config'; import { sendTelemetry } from '../../../../utils/sendTelemetry'; import { InstanceDto } from '../../../dto/instance.dto'; @@ -528,6 +532,7 @@ export class OpenaiService { stopBotFromMe: data.stopBotFromMe, keepOpen: data.keepOpen, debounceTime: data.debounceTime, + speechToText: data.speechToText, openaiIdFallback: data.openaiIdFallback, ignoreJids: data.ignoreJids, }, @@ -543,6 +548,7 @@ export class OpenaiService { stopBotFromMe: updateSettings.stopBotFromMe, keepOpen: updateSettings.keepOpen, debounceTime: updateSettings.debounceTime, + speechToText: updateSettings.speechToText, openaiIdFallback: updateSettings.openaiIdFallback, ignoreJids: updateSettings.ignoreJids, }; @@ -561,6 +567,7 @@ export class OpenaiService { debounceTime: data.debounceTime, openaiIdFallback: data.openaiIdFallback, ignoreJids: data.ignoreJids, + speechToText: data.speechToText, instanceId: instanceId, }, }); @@ -615,6 +622,7 @@ export class OpenaiService { keepOpen: false, ignoreJids: [], openaiIdFallback: null, + speechToText: false, fallback: null, }; } @@ -630,6 +638,7 @@ export class OpenaiService { keepOpen: settings.keepOpen, ignoreJids: settings.ignoreJids, openaiIdFallback: settings.openaiIdFallback, + speechToText: settings.speechToText, fallback: settings.Fallback, }; } catch (error) { @@ -823,7 +832,11 @@ export class OpenaiService { listResponseMessage: msg?.message?.listResponseMessage?.singleSelectReply?.selectedRowId, responseRowId: msg?.message?.listResponseMessage?.singleSelectReply?.selectedRowId, // Medias - audioMessage: msg?.message?.audioMessage ? `audioMessage|${mediaId}` : undefined, + audioMessage: msg?.message?.speechToText + ? 
msg?.message?.speechToText + : msg?.message?.audioMessage + ? `audioMessage|${mediaId}` + : undefined, imageMessage: msg?.message?.imageMessage ? `imageMessage|${mediaId}` : undefined, videoMessage: msg?.message?.videoMessage ? `videoMessage|${mediaId}` : undefined, documentMessage: msg?.message?.documentMessage ? `documentMessage|${mediaId}` : undefined, @@ -1779,4 +1792,43 @@ export class OpenaiService { return; } + + public async speechToText(creds: OpenaiCreds, msg: any, updateMediaMessage: any) { + let audio; + + if (msg?.message?.mediaUrl) { + audio = await axios.get(msg.message.mediaUrl, { responseType: 'arraybuffer' }).then((response) => { + return Buffer.from(response.data, 'binary'); + }); + } else { + audio = await downloadMediaMessage( + { key: msg.key, message: msg?.message }, + 'buffer', + {}, + { + logger: P({ level: 'error' }) as any, + reuploadRequest: updateMediaMessage, + }, + ); + } + + const lang = this.configService.get('LANGUAGE').includes('pt') + ? 'pt' + : this.configService.get('LANGUAGE'); + + const formData = new FormData(); + + formData.append('file', audio, 'audio.ogg'); + formData.append('model', 'whisper-1'); + formData.append('language', lang); + + const response = await axios.post('https://api.openai.com/v1/audio/transcriptions', formData, { + headers: { + 'Content-Type': 'multipart/form-data', + Authorization: `Bearer ${creds.apiKey}`, + }, + }); + + return response?.data?.text; + } } diff --git a/src/api/integrations/openai/validate/openai.schema.ts b/src/api/integrations/openai/validate/openai.schema.ts index 7131e0dd..0aaa054d 100644 --- a/src/api/integrations/openai/validate/openai.schema.ts +++ b/src/api/integrations/openai/validate/openai.schema.ts @@ -85,6 +85,7 @@ export const openaiSettingSchema: JSONSchema7 = { stopBotFromMe: { type: 'boolean' }, keepOpen: { type: 'boolean' }, debounceTime: { type: 'integer' }, + speechToText: { type: 'boolean' }, ignoreJids: { type: 'array', items: { type: 'string' } }, 
openaiIdFallback: { type: 'string' }, }, diff --git a/src/api/integrations/typebot/services/typebot.service.ts b/src/api/integrations/typebot/services/typebot.service.ts index 1f3476d8..75b8330b 100644 --- a/src/api/integrations/typebot/services/typebot.service.ts +++ b/src/api/integrations/typebot/services/typebot.service.ts @@ -931,7 +931,11 @@ export class TypebotService { listResponseMessage: msg?.message?.listResponseMessage?.singleSelectReply?.selectedRowId, responseRowId: msg?.message?.listResponseMessage?.singleSelectReply?.selectedRowId, // Medias - audioMessage: msg?.message?.audioMessage ? `audioMessage|${mediaId}` : undefined, + audioMessage: msg?.message?.speechToText + ? msg?.message?.speechToText + : msg?.message?.audioMessage + ? `audioMessage|${mediaId}` + : undefined, imageMessage: msg?.message?.imageMessage ? `imageMessage|${mediaId}` : undefined, videoMessage: msg?.message?.videoMessage ? `videoMessage|${mediaId}` : undefined, documentMessage: msg?.message?.documentMessage ? `documentMessage|${mediaId}` : undefined, diff --git a/src/api/services/channels/whatsapp.baileys.service.ts b/src/api/services/channels/whatsapp.baileys.service.ts index 1a7194b0..0279043f 100644 --- a/src/api/services/channels/whatsapp.baileys.service.ts +++ b/src/api/services/channels/whatsapp.baileys.service.ts @@ -1161,6 +1161,30 @@ export class BaileysStartupService extends ChannelStartupService { messageRaw.message.base64 = buffer ? 
buffer.toString('base64') : undefined; } + if (this.configService.get('OPENAI').ENABLED) { + const openAiDefaultSettings = await this.prismaRepository.openaiSetting.findFirst({ + where: { + instanceId: this.instanceId, + }, + include: { + OpenaiCreds: true, + }, + }); + + if ( + openAiDefaultSettings && + openAiDefaultSettings.openaiCredsId && + openAiDefaultSettings.speechToText && + received?.message?.audioMessage + ) { + messageRaw.message.speechToText = await this.openaiService.speechToText( + openAiDefaultSettings.OpenaiCreds, + received, + this.client.updateMediaMessage, + ); + } + } + this.logger.log(messageRaw); this.sendDataWebhook(Events.MESSAGES_UPSERT, messageRaw); diff --git a/src/api/services/channels/whatsapp.business.service.ts b/src/api/services/channels/whatsapp.business.service.ts index d1f36e71..87c9e138 100644 --- a/src/api/services/channels/whatsapp.business.service.ts +++ b/src/api/services/channels/whatsapp.business.service.ts @@ -5,7 +5,7 @@ import FormData from 'form-data'; import { createReadStream } from 'fs'; import { getMIMEType } from 'node-mime-types'; -import { Chatwoot, ConfigService, Database, Typebot, WaBusiness } from '../../../config/env.config'; +import { Chatwoot, ConfigService, Database, Dify, Openai, Typebot, WaBusiness } from '../../../config/env.config'; import { BadRequestException, InternalServerErrorException } from '../../../exceptions'; import { NumberBusiness } from '../../dto/chat.dto'; import { @@ -403,6 +403,30 @@ export class BusinessStartupService extends ChannelStartupService { // await this.client.readMessages([received.key]); } + if (this.configService.get('OPENAI').ENABLED) { + const openAiDefaultSettings = await this.prismaRepository.openaiSetting.findFirst({ + where: { + instanceId: this.instanceId, + }, + include: { + OpenaiCreds: true, + }, + }); + + if ( + openAiDefaultSettings && + openAiDefaultSettings.openaiCredsId && + openAiDefaultSettings.speechToText && + received?.message?.audioMessage + ) { + 
messageRaw.message.speechToText = await this.openaiService.speechToText( + openAiDefaultSettings.OpenaiCreds, + received, + this.client.updateMediaMessage, + ); + } + } + this.logger.log(messageRaw); this.sendDataWebhook(Events.MESSAGES_UPSERT, messageRaw); @@ -430,6 +454,24 @@ export class BusinessStartupService extends ChannelStartupService { ); } + if (this.configService.get('OPENAI').ENABLED) { + if (messageRaw.messageType !== 'reactionMessage') + await this.openaiService.sendOpenai( + { instanceName: this.instance.name, instanceId: this.instanceId }, + messageRaw.key.remoteJid, + messageRaw, + ); + } + + if (this.configService.get('DIFY').ENABLED) { + if (messageRaw.messageType !== 'reactionMessage') + await this.difyService.sendDify( + { instanceName: this.instance.name, instanceId: this.instanceId }, + messageRaw.key.remoteJid, + messageRaw, + ); + } + await this.prismaRepository.message.create({ data: messageRaw, });