diff --git a/CHANGELOG.md b/CHANGELOG.md index 1cf3b9c9..c07736f0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,9 @@ # 2.0.5-rc (release candidate) +### Features + +* Speech to Text with Openai + ### Fixed * ClientName on infos diff --git a/prisma/migrations/20240801193907_add_column_speech_to_text_openai_setting_table/migration.sql b/prisma/migrations/20240801193907_add_column_speech_to_text_openai_setting_table/migration.sql new file mode 100644 index 00000000..1f8e1fe5 --- /dev/null +++ b/prisma/migrations/20240801193907_add_column_speech_to_text_openai_setting_table/migration.sql @@ -0,0 +1,2 @@ +-- AlterTable +ALTER TABLE "OpenaiSetting" ADD COLUMN "speechToText" BOOLEAN DEFAULT false; diff --git a/prisma/postgresql-schema.prisma b/prisma/postgresql-schema.prisma index d077530a..686c560f 100644 --- a/prisma/postgresql-schema.prisma +++ b/prisma/postgresql-schema.prisma @@ -422,6 +422,7 @@ model OpenaiSetting { keepOpen Boolean? @default(false) @db.Boolean debounceTime Int? @db.Integer ignoreJids Json? + speechToText Boolean? @default(false) @db.Boolean createdAt DateTime? @default(now()) @db.Timestamp updatedAt DateTime @updatedAt @db.Timestamp OpenaiCreds OpenaiCreds? @relation(fields: [openaiCredsId], references: [id]) diff --git a/src/api/integrations/dify/services/dify.service.ts b/src/api/integrations/dify/services/dify.service.ts index 2c3a442d..b2131c92 100644 --- a/src/api/integrations/dify/services/dify.service.ts +++ b/src/api/integrations/dify/services/dify.service.ts @@ -670,7 +670,11 @@ export class DifyService { listResponseMessage: msg?.message?.listResponseMessage?.singleSelectReply?.selectedRowId, responseRowId: msg?.message?.listResponseMessage?.singleSelectReply?.selectedRowId, // Medias - audioMessage: msg?.message?.audioMessage ? `audioMessage|${mediaId}` : undefined, + audioMessage: msg?.message?.speechToText + ? msg?.message?.speechToText + : msg?.message?.audioMessage + ? `audioMessage|${mediaId}` + : undefined, imageMessage: msg?.message?.imageMessage ? `imageMessage|${mediaId}` : undefined, videoMessage: msg?.message?.videoMessage ? `videoMessage|${mediaId}` : undefined, documentMessage: msg?.message?.documentMessage ? `documentMessage|${mediaId}` : undefined, diff --git a/src/api/integrations/openai/dto/openai.dto.ts b/src/api/integrations/openai/dto/openai.dto.ts index c4b7ebef..89b14230 100644 --- a/src/api/integrations/openai/dto/openai.dto.ts +++ b/src/api/integrations/openai/dto/openai.dto.ts @@ -49,6 +49,7 @@ export class OpenaiSettingDto { debounceTime?: number; openaiIdFallback?: string; ignoreJids?: any; + speechToText?: boolean; } export class OpenaiIgnoreJidDto { diff --git a/src/api/integrations/openai/services/openai.service.ts b/src/api/integrations/openai/services/openai.service.ts index 0f2e24fe..523a9784 100644 --- a/src/api/integrations/openai/services/openai.service.ts +++ b/src/api/integrations/openai/services/openai.service.ts @@ -1,7 +1,11 @@ -import { Message, OpenaiBot, OpenaiSession, OpenaiSetting } from '@prisma/client'; +import { Message, OpenaiBot, OpenaiCreds, OpenaiSession, OpenaiSetting } from '@prisma/client'; +import axios from 'axios'; +import { downloadMediaMessage } from 'baileys'; +import FormData from 'form-data'; import OpenAI from 'openai'; +import P from 'pino'; -import { ConfigService, S3 } from '../../../../config/env.config'; +import { ConfigService, Language, S3 } from '../../../../config/env.config'; import { Logger } from '../../../../config/logger.config'; import { sendTelemetry } from '../../../../utils/sendTelemetry'; import { InstanceDto } from '../../../dto/instance.dto'; @@ -528,6 +532,7 @@ export class OpenaiService { stopBotFromMe: data.stopBotFromMe, keepOpen: data.keepOpen, debounceTime: data.debounceTime, + speechToText: data.speechToText, openaiIdFallback: data.openaiIdFallback, ignoreJids: data.ignoreJids, }, @@ -543,6 +548,7 @@ export class OpenaiService { stopBotFromMe: updateSettings.stopBotFromMe, keepOpen: updateSettings.keepOpen, debounceTime: updateSettings.debounceTime, + speechToText: updateSettings.speechToText, openaiIdFallback: updateSettings.openaiIdFallback, ignoreJids: updateSettings.ignoreJids, }; @@ -561,6 +567,7 @@ export class OpenaiService { debounceTime: data.debounceTime, openaiIdFallback: data.openaiIdFallback, ignoreJids: data.ignoreJids, + speechToText: data.speechToText, instanceId: instanceId, }, }); @@ -615,6 +622,7 @@ export class OpenaiService { keepOpen: false, ignoreJids: [], openaiIdFallback: null, + speechToText: false, fallback: null, }; } @@ -630,6 +638,7 @@ export class OpenaiService { keepOpen: settings.keepOpen, ignoreJids: settings.ignoreJids, openaiIdFallback: settings.openaiIdFallback, + speechToText: settings.speechToText, fallback: settings.Fallback, }; } catch (error) { @@ -823,7 +832,11 @@ export class OpenaiService { listResponseMessage: msg?.message?.listResponseMessage?.singleSelectReply?.selectedRowId, responseRowId: msg?.message?.listResponseMessage?.singleSelectReply?.selectedRowId, // Medias - audioMessage: msg?.message?.audioMessage ? `audioMessage|${mediaId}` : undefined, + audioMessage: msg?.message?.speechToText + ? msg?.message?.speechToText + : msg?.message?.audioMessage + ? `audioMessage|${mediaId}` + : undefined, imageMessage: msg?.message?.imageMessage ? `imageMessage|${mediaId}` : undefined, videoMessage: msg?.message?.videoMessage ? `videoMessage|${mediaId}` : undefined, documentMessage: msg?.message?.documentMessage ? `documentMessage|${mediaId}` : undefined, @@ -1779,4 +1792,43 @@ export class OpenaiService { return; } + + public async speechToText(creds: OpenaiCreds, msg: any, updateMediaMessage: any) { + let audio; + + if (msg?.message?.mediaUrl) { + audio = await axios.get(msg.message.mediaUrl, { responseType: 'arraybuffer' }).then((response) => { + return Buffer.from(response.data, 'binary'); + }); + } else { + audio = await downloadMediaMessage( + { key: msg.key, message: msg?.message }, + 'buffer', + {}, + { + logger: P({ level: 'error' }) as any, + reuploadRequest: updateMediaMessage, + }, + ); + } + + const lang = this.configService.get('LANGUAGE').includes('pt') + ? 'pt' + : this.configService.get('LANGUAGE'); + + const formData = new FormData(); + + formData.append('file', audio, 'audio.ogg'); + formData.append('model', 'whisper-1'); + formData.append('language', lang); + + const response = await axios.post('https://api.openai.com/v1/audio/transcriptions', formData, { + headers: { + 'Content-Type': 'multipart/form-data', + Authorization: `Bearer ${creds.apiKey}`, + }, + }); + + return response?.data?.text; + } } diff --git a/src/api/integrations/openai/validate/openai.schema.ts b/src/api/integrations/openai/validate/openai.schema.ts index 7131e0dd..0aaa054d 100644 --- a/src/api/integrations/openai/validate/openai.schema.ts +++ b/src/api/integrations/openai/validate/openai.schema.ts @@ -85,6 +85,7 @@ export const openaiSettingSchema: JSONSchema7 = { stopBotFromMe: { type: 'boolean' }, keepOpen: { type: 'boolean' }, debounceTime: { type: 'integer' }, + speechToText: { type: 'boolean' }, ignoreJids: { type: 'array', items: { type: 'string' } }, openaiIdFallback: { type: 'string' }, }, diff --git a/src/api/integrations/typebot/services/typebot.service.ts b/src/api/integrations/typebot/services/typebot.service.ts index 1f3476d8..75b8330b 100644 --- a/src/api/integrations/typebot/services/typebot.service.ts +++ b/src/api/integrations/typebot/services/typebot.service.ts @@ -931,7 +931,11 @@ export class TypebotService { listResponseMessage: msg?.message?.listResponseMessage?.singleSelectReply?.selectedRowId, responseRowId: msg?.message?.listResponseMessage?.singleSelectReply?.selectedRowId, // Medias - audioMessage: msg?.message?.audioMessage ? `audioMessage|${mediaId}` : undefined, + audioMessage: msg?.message?.speechToText + ? msg?.message?.speechToText + : msg?.message?.audioMessage + ? `audioMessage|${mediaId}` + : undefined, imageMessage: msg?.message?.imageMessage ? `imageMessage|${mediaId}` : undefined, videoMessage: msg?.message?.videoMessage ? `videoMessage|${mediaId}` : undefined, documentMessage: msg?.message?.documentMessage ? `documentMessage|${mediaId}` : undefined, diff --git a/src/api/services/channels/whatsapp.baileys.service.ts b/src/api/services/channels/whatsapp.baileys.service.ts index 1a7194b0..0279043f 100644 --- a/src/api/services/channels/whatsapp.baileys.service.ts +++ b/src/api/services/channels/whatsapp.baileys.service.ts @@ -1161,6 +1161,30 @@ export class BaileysStartupService extends ChannelStartupService { messageRaw.message.base64 = buffer ? buffer.toString('base64') : undefined; } + if (this.configService.get('OPENAI').ENABLED) { + const openAiDefaultSettings = await this.prismaRepository.openaiSetting.findFirst({ + where: { + instanceId: this.instanceId, + }, + include: { + OpenaiCreds: true, + }, + }); + + if ( + openAiDefaultSettings && + openAiDefaultSettings.openaiCredsId && + openAiDefaultSettings.speechToText && + received?.message?.audioMessage + ) { + messageRaw.message.speechToText = await this.openaiService.speechToText( + openAiDefaultSettings.OpenaiCreds, + received, + this.client.updateMediaMessage, + ); + } + } + this.logger.log(messageRaw); this.sendDataWebhook(Events.MESSAGES_UPSERT, messageRaw); diff --git a/src/api/services/channels/whatsapp.business.service.ts b/src/api/services/channels/whatsapp.business.service.ts index d1f36e71..87c9e138 100644 --- a/src/api/services/channels/whatsapp.business.service.ts +++ b/src/api/services/channels/whatsapp.business.service.ts @@ -5,7 +5,7 @@ import FormData from 'form-data'; import { createReadStream } from 'fs'; import { getMIMEType } from 'node-mime-types'; -import { Chatwoot, ConfigService, Database, Typebot, WaBusiness } from '../../../config/env.config'; +import { Chatwoot, ConfigService, Database, Dify, Openai, Typebot, WaBusiness } from '../../../config/env.config'; import { BadRequestException, InternalServerErrorException } from '../../../exceptions'; import { NumberBusiness } from '../../dto/chat.dto'; import { @@ -403,6 +403,30 @@ export class BusinessStartupService extends ChannelStartupService { // await this.client.readMessages([received.key]); } + if (this.configService.get('OPENAI').ENABLED) { + const openAiDefaultSettings = await this.prismaRepository.openaiSetting.findFirst({ + where: { + instanceId: this.instanceId, + }, + include: { + OpenaiCreds: true, + }, + }); + + if ( + openAiDefaultSettings && + openAiDefaultSettings.openaiCredsId && + openAiDefaultSettings.speechToText && + received?.message?.audioMessage + ) { + messageRaw.message.speechToText = await this.openaiService.speechToText( + openAiDefaultSettings.OpenaiCreds, + received, + this.client.updateMediaMessage, + ); + } + } + this.logger.log(messageRaw); this.sendDataWebhook(Events.MESSAGES_UPSERT, messageRaw); @@ -430,6 +454,24 @@ export class BusinessStartupService extends ChannelStartupService { ); } + if (this.configService.get('OPENAI').ENABLED) { + if (messageRaw.messageType !== 'reactionMessage') + await this.openaiService.sendOpenai( + { instanceName: this.instance.name, instanceId: this.instanceId }, + messageRaw.key.remoteJid, + messageRaw, + ); + } + + if (this.configService.get('DIFY').ENABLED) { + if (messageRaw.messageType !== 'reactionMessage') + await this.difyService.sendDify( + { instanceName: this.instance.name, instanceId: this.instanceId }, + messageRaw.key.remoteJid, + messageRaw, + ); + } + await this.prismaRepository.message.create({ data: messageRaw, });