Mirror of https://github.com/EvolutionAPI/evolution-api.git (synced 2025-07-14 09:51:24 -06:00)
Add speech-to-text functionality using OpenAI
This commit introduces a new feature that transcribes audio messages to text using OpenAI's Whisper model. The following files were modified to implement it:

- `CHANGELOG.md`: added an entry under the 'Features' section documenting the speech-to-text functionality.
- `prisma/postgresql-schema.prisma`: added a new boolean field `speechToText` to the `OpenaiSetting` model (with a matching SQL migration).
- `src/api/integrations/openai/dto/openai.dto.ts`: added an optional boolean property `speechToText` to the `OpenaiSettingDto` class.
- `src/api/integrations/openai/services/openai.service.ts`: implemented the `speechToText` method that performs the transcription.
- `src/api/integrations/openai/validate/openai.schema.ts`: added a boolean schema entry for the `speechToText` property in `openaiSettingSchema`.
- `src/api/integrations/typebot/services/typebot.service.ts`: updated the `audioMessage` mapping to prefer the new `speechToText` value when it is present.
- `src/api/services/channels/whatsapp.baileys.service.ts` and `src/api/services/channels/whatsapp.business.service.ts`: added logic to transcribe incoming audio messages when the `speechToText` setting is enabled.

The diff also applies the same `audioMessage` mapping change to `DifyService` and wires the Business channel into the OpenAI and Dify bot dispatch.

The purpose of this change is to make audio messages more accessible by converting them to text, which is particularly useful for users with hearing impairments or anyone in a noisy environment.
parent a73b74ceaa
commit 67409e1bf5
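For orientation before the diff: the heart of the change is a transcription helper in `openai.service.ts` that downloads the voice note and sends it to OpenAI's `/v1/audio/transcriptions` endpoint. The sketch below condenses that flow into a standalone function; `audioUrl` and `apiKey` are placeholder inputs, and the real method additionally falls back to downloading the media through Baileys and picks the language from the instance configuration, as the full diff shows.

```typescript
import axios from 'axios';
import FormData from 'form-data';

// Minimal sketch of the transcription flow implemented by the new speechToText method.
// `audioUrl` and `apiKey` are placeholders; the service also supports downloading the
// media directly from WhatsApp via Baileys when no mediaUrl is available.
async function transcribeAudio(audioUrl: string, apiKey: string, language = 'en'): Promise<string | undefined> {
  // Download the audio as a raw buffer.
  const audio = await axios
    .get(audioUrl, { responseType: 'arraybuffer' })
    .then((response) => Buffer.from(response.data, 'binary'));

  // Build the multipart payload expected by the transcription endpoint.
  const formData = new FormData();
  formData.append('file', audio, 'audio.ogg');
  formData.append('model', 'whisper-1');
  formData.append('language', language);

  // Call OpenAI's transcription endpoint (headers mirror the service implementation)
  // and return the recognized text.
  const response = await axios.post('https://api.openai.com/v1/audio/transcriptions', formData, {
    headers: {
      'Content-Type': 'multipart/form-data',
      Authorization: `Bearer ${apiKey}`,
    },
  });

  return response?.data?.text;
}
```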
@@ -1,5 +1,9 @@
 # 2.0.5-rc (release candidate)
 
+### Features
+
+* Speech to Text with Openai
+
 ### Fixed
 
 * ClientName on infos
@@ -0,0 +1,2 @@
+-- AlterTable
+ALTER TABLE "OpenaiSetting" ADD COLUMN "speechToText" BOOLEAN DEFAULT false;
@@ -422,6 +422,7 @@ model OpenaiSetting {
   keepOpen Boolean? @default(false) @db.Boolean
   debounceTime Int? @db.Integer
   ignoreJids Json?
+  speechToText Boolean? @default(false) @db.Boolean
   createdAt DateTime? @default(now()) @db.Timestamp
   updatedAt DateTime @updatedAt @db.Timestamp
   OpenaiCreds OpenaiCreds? @relation(fields: [openaiCredsId], references: [id])
@@ -670,7 +670,11 @@ export class DifyService {
       listResponseMessage: msg?.message?.listResponseMessage?.singleSelectReply?.selectedRowId,
       responseRowId: msg?.message?.listResponseMessage?.singleSelectReply?.selectedRowId,
       // Medias
-      audioMessage: msg?.message?.audioMessage ? `audioMessage|${mediaId}` : undefined,
+      audioMessage: msg?.message?.speechToText
+        ? msg?.message?.speechToText
+        : msg?.message?.audioMessage
+          ? `audioMessage|${mediaId}`
+          : undefined,
       imageMessage: msg?.message?.imageMessage ? `imageMessage|${mediaId}` : undefined,
       videoMessage: msg?.message?.videoMessage ? `videoMessage|${mediaId}` : undefined,
       documentMessage: msg?.message?.documentMessage ? `documentMessage|${mediaId}` : undefined,
@@ -49,6 +49,7 @@ export class OpenaiSettingDto {
   debounceTime?: number;
   openaiIdFallback?: string;
   ignoreJids?: any;
+  speechToText?: boolean;
 }
 
 export class OpenaiIgnoreJidDto {
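With the DTO extended, enabling transcription is just a matter of including the new flag in the instance's OpenAI settings payload. The object below is an illustrative sketch shaped after the fields visible in this diff; the concrete values (and whatever settings endpoint you send it to) are assumptions, not part of the commit.

```typescript
// Illustrative OpenAI settings body after this change; only `speechToText` is new.
// All values are placeholders for demonstration.
const openaiSettings = {
  speechToText: true, // transcribe incoming audio messages with Whisper
  stopBotFromMe: true,
  keepOpen: false,
  debounceTime: 10, // placeholder value
  ignoreJids: [] as string[],
  openaiIdFallback: undefined as string | undefined,
};
```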
@@ -1,7 +1,11 @@
-import { Message, OpenaiBot, OpenaiSession, OpenaiSetting } from '@prisma/client';
+import { Message, OpenaiBot, OpenaiCreds, OpenaiSession, OpenaiSetting } from '@prisma/client';
+import axios from 'axios';
+import { downloadMediaMessage } from 'baileys';
+import FormData from 'form-data';
 import OpenAI from 'openai';
+import P from 'pino';
 
-import { ConfigService, S3 } from '../../../../config/env.config';
+import { ConfigService, Language, S3 } from '../../../../config/env.config';
 import { Logger } from '../../../../config/logger.config';
 import { sendTelemetry } from '../../../../utils/sendTelemetry';
 import { InstanceDto } from '../../../dto/instance.dto';
@@ -528,6 +532,7 @@ export class OpenaiService {
         stopBotFromMe: data.stopBotFromMe,
         keepOpen: data.keepOpen,
         debounceTime: data.debounceTime,
+        speechToText: data.speechToText,
         openaiIdFallback: data.openaiIdFallback,
         ignoreJids: data.ignoreJids,
       },
@@ -543,6 +548,7 @@ export class OpenaiService {
       stopBotFromMe: updateSettings.stopBotFromMe,
       keepOpen: updateSettings.keepOpen,
       debounceTime: updateSettings.debounceTime,
+      speechToText: updateSettings.speechToText,
       openaiIdFallback: updateSettings.openaiIdFallback,
       ignoreJids: updateSettings.ignoreJids,
     };
@@ -561,6 +567,7 @@ export class OpenaiService {
         debounceTime: data.debounceTime,
         openaiIdFallback: data.openaiIdFallback,
         ignoreJids: data.ignoreJids,
+        speechToText: data.speechToText,
         instanceId: instanceId,
       },
     });
@@ -615,6 +622,7 @@ export class OpenaiService {
         keepOpen: false,
         ignoreJids: [],
         openaiIdFallback: null,
+        speechToText: false,
         fallback: null,
       };
     }
@@ -630,6 +638,7 @@ export class OpenaiService {
       keepOpen: settings.keepOpen,
       ignoreJids: settings.ignoreJids,
       openaiIdFallback: settings.openaiIdFallback,
+      speechToText: settings.speechToText,
       fallback: settings.Fallback,
     };
   } catch (error) {
@@ -823,7 +832,11 @@ export class OpenaiService {
       listResponseMessage: msg?.message?.listResponseMessage?.singleSelectReply?.selectedRowId,
       responseRowId: msg?.message?.listResponseMessage?.singleSelectReply?.selectedRowId,
       // Medias
-      audioMessage: msg?.message?.audioMessage ? `audioMessage|${mediaId}` : undefined,
+      audioMessage: msg?.message?.speechToText
+        ? msg?.message?.speechToText
+        : msg?.message?.audioMessage
+          ? `audioMessage|${mediaId}`
+          : undefined,
       imageMessage: msg?.message?.imageMessage ? `imageMessage|${mediaId}` : undefined,
       videoMessage: msg?.message?.videoMessage ? `videoMessage|${mediaId}` : undefined,
       documentMessage: msg?.message?.documentMessage ? `documentMessage|${mediaId}` : undefined,
@@ -1779,4 +1792,43 @@ export class OpenaiService {
 
     return;
   }
+
+  public async speechToText(creds: OpenaiCreds, msg: any, updateMediaMessage: any) {
+    let audio;
+
+    if (msg?.message?.mediaUrl) {
+      audio = await axios.get(msg.message.mediaUrl, { responseType: 'arraybuffer' }).then((response) => {
+        return Buffer.from(response.data, 'binary');
+      });
+    } else {
+      audio = await downloadMediaMessage(
+        { key: msg.key, message: msg?.message },
+        'buffer',
+        {},
+        {
+          logger: P({ level: 'error' }) as any,
+          reuploadRequest: updateMediaMessage,
+        },
+      );
+    }
+
+    const lang = this.configService.get<Language>('LANGUAGE').includes('pt')
+      ? 'pt'
+      : this.configService.get<Language>('LANGUAGE');
+
+    const formData = new FormData();
+
+    formData.append('file', audio, 'audio.ogg');
+    formData.append('model', 'whisper-1');
+    formData.append('language', lang);
+
+    const response = await axios.post('https://api.openai.com/v1/audio/transcriptions', formData, {
+      headers: {
+        'Content-Type': 'multipart/form-data',
+        Authorization: `Bearer ${creds.apiKey}`,
+      },
+    });
+
+    return response?.data?.text;
+  }
 }
@@ -85,6 +85,7 @@ export const openaiSettingSchema: JSONSchema7 = {
     stopBotFromMe: { type: 'boolean' },
     keepOpen: { type: 'boolean' },
     debounceTime: { type: 'integer' },
+    speechToText: { type: 'boolean' },
     ignoreJids: { type: 'array', items: { type: 'string' } },
     openaiIdFallback: { type: 'string' },
   },
@@ -931,7 +931,11 @@ export class TypebotService {
       listResponseMessage: msg?.message?.listResponseMessage?.singleSelectReply?.selectedRowId,
       responseRowId: msg?.message?.listResponseMessage?.singleSelectReply?.selectedRowId,
       // Medias
-      audioMessage: msg?.message?.audioMessage ? `audioMessage|${mediaId}` : undefined,
+      audioMessage: msg?.message?.speechToText
+        ? msg?.message?.speechToText
+        : msg?.message?.audioMessage
+          ? `audioMessage|${mediaId}`
+          : undefined,
       imageMessage: msg?.message?.imageMessage ? `imageMessage|${mediaId}` : undefined,
       videoMessage: msg?.message?.videoMessage ? `videoMessage|${mediaId}` : undefined,
       documentMessage: msg?.message?.documentMessage ? `documentMessage|${mediaId}` : undefined,
@@ -1161,6 +1161,30 @@ export class BaileysStartupService extends ChannelStartupService {
           messageRaw.message.base64 = buffer ? buffer.toString('base64') : undefined;
         }
 
+        if (this.configService.get<Openai>('OPENAI').ENABLED) {
+          const openAiDefaultSettings = await this.prismaRepository.openaiSetting.findFirst({
+            where: {
+              instanceId: this.instanceId,
+            },
+            include: {
+              OpenaiCreds: true,
+            },
+          });
+
+          if (
+            openAiDefaultSettings &&
+            openAiDefaultSettings.openaiCredsId &&
+            openAiDefaultSettings.speechToText &&
+            received?.message?.audioMessage
+          ) {
+            messageRaw.message.speechToText = await this.openaiService.speechToText(
+              openAiDefaultSettings.OpenaiCreds,
+              received,
+              this.client.updateMediaMessage,
+            );
+          }
+        }
+
         this.logger.log(messageRaw);
 
         this.sendDataWebhook(Events.MESSAGES_UPSERT, messageRaw);
@@ -5,7 +5,7 @@ import FormData from 'form-data';
 import { createReadStream } from 'fs';
 import { getMIMEType } from 'node-mime-types';
 
-import { Chatwoot, ConfigService, Database, Typebot, WaBusiness } from '../../../config/env.config';
+import { Chatwoot, ConfigService, Database, Dify, Openai, Typebot, WaBusiness } from '../../../config/env.config';
 import { BadRequestException, InternalServerErrorException } from '../../../exceptions';
 import { NumberBusiness } from '../../dto/chat.dto';
 import {
@@ -403,6 +403,30 @@ export class BusinessStartupService extends ChannelStartupService {
         // await this.client.readMessages([received.key]);
       }
 
+      if (this.configService.get<Openai>('OPENAI').ENABLED) {
+        const openAiDefaultSettings = await this.prismaRepository.openaiSetting.findFirst({
+          where: {
+            instanceId: this.instanceId,
+          },
+          include: {
+            OpenaiCreds: true,
+          },
+        });
+
+        if (
+          openAiDefaultSettings &&
+          openAiDefaultSettings.openaiCredsId &&
+          openAiDefaultSettings.speechToText &&
+          received?.message?.audioMessage
+        ) {
+          messageRaw.message.speechToText = await this.openaiService.speechToText(
+            openAiDefaultSettings.OpenaiCreds,
+            received,
+            this.client.updateMediaMessage,
+          );
+        }
+      }
+
       this.logger.log(messageRaw);
 
       this.sendDataWebhook(Events.MESSAGES_UPSERT, messageRaw);
@@ -430,6 +454,24 @@ export class BusinessStartupService extends ChannelStartupService {
         );
       }
 
+      if (this.configService.get<Openai>('OPENAI').ENABLED) {
+        if (messageRaw.messageType !== 'reactionMessage')
+          await this.openaiService.sendOpenai(
+            { instanceName: this.instance.name, instanceId: this.instanceId },
+            messageRaw.key.remoteJid,
+            messageRaw,
+          );
+      }
+
+      if (this.configService.get<Dify>('DIFY').ENABLED) {
+        if (messageRaw.messageType !== 'reactionMessage')
+          await this.difyService.sendDify(
+            { instanceName: this.instance.name, instanceId: this.instanceId },
+            messageRaw.key.remoteJid,
+            messageRaw,
+          );
+      }
+
       await this.prismaRepository.message.create({
         data: messageRaw,
       });