mirror of
https://github.com/EvolutionAPI/evolution-audio-converter.git
synced 2025-07-13 07:04:51 -06:00
feat: add audio transcription feature with OpenAI and Groq support
- Updated .env.example to include transcription configuration options. - Enhanced main.go to support audio transcription, including new endpoints and logic for handling transcription requests. - Added functionality to transcribe audio using OpenAI and Groq APIs. - Updated README.md with detailed instructions on enabling and using the transcription feature.
This commit is contained in:
parent
2773e4749d
commit
30a4990e53
@ -1,3 +1,8 @@
|
||||
PORT=4040
|
||||
CORS_ALLOW_ORIGINS=*
|
||||
API_KEY=429683C4C977415CAAFCCE10F7D57E11
|
||||
API_KEY=429683C4C977415CAAFCCE10F7D57E11
|
||||
ENABLE_TRANSCRIPTION=true
|
||||
TRANSCRIPTION_PROVIDER=openai # ou groq
|
||||
OPENAI_API_KEY=sua_chave_openai_aqui
|
||||
GROQ_API_KEY=sua_chave_groq_aqui
|
||||
TRANSCRIPTION_LANGUAGE=pt
|
||||
|
63
README.md
63
README.md
@ -57,7 +57,23 @@ PORT=4040
|
||||
API_KEY=your_secret_api_key_here
|
||||
```
|
||||
|
||||
This defines the port where the service will run.
|
||||
### Transcription Configuration
|
||||
|
||||
To enable audio transcription, configure the following variables in the `.env` file:
|
||||
|
||||
```env
|
||||
ENABLE_TRANSCRIPTION=true
|
||||
TRANSCRIPTION_PROVIDER=openai # or groq
|
||||
OPENAI_API_KEY=your_openai_key_here
|
||||
GROQ_API_KEY=your_groq_key_here
|
||||
TRANSCRIPTION_LANGUAGE=en # Default transcription language (optional)
|
||||
```
|
||||
|
||||
- `ENABLE_TRANSCRIPTION`: Enables or disables the transcription feature
|
||||
- `TRANSCRIPTION_PROVIDER`: Chooses the AI provider for transcription (openai or groq)
|
||||
- `OPENAI_API_KEY`: Your OpenAI API key (required if using openai)
|
||||
- `GROQ_API_KEY`: Your Groq API key (required if using groq)
|
||||
- `TRANSCRIPTION_LANGUAGE`: Sets the default transcription language (optional)
|
||||
|
||||
## Running the Project
|
||||
|
||||
@ -107,6 +123,51 @@ All requests must include the `apikey` header with the value of the `API_KEY` co
|
||||
- `mp3`
|
||||
- `ogg` (default)
|
||||
|
||||
### Audio Transcription
|
||||
|
||||
You can get the audio transcription in two ways:
|
||||
|
||||
1. Along with audio processing by adding the `transcribe=true` parameter:
|
||||
|
||||
```bash
|
||||
curl -X POST -F "file=@audio.mp3" \
|
||||
-F "transcribe=true" \
|
||||
-F "language=en" \
|
||||
http://localhost:4040/process-audio \
|
||||
-H "apikey: your_secret_api_key_here"
|
||||
```
|
||||
|
||||
2. Using the specific transcription endpoint:
|
||||
|
||||
```bash
|
||||
curl -X POST -F "file=@audio.mp3" \
|
||||
-F "language=en" \
|
||||
http://localhost:4040/transcribe \
|
||||
-H "apikey: your_secret_api_key_here"
|
||||
```
|
||||
|
||||
Optional parameters:
|
||||
- `language`: Audio language code (e.g., "en", "es", "pt"). If not specified, it will use the value defined in `TRANSCRIPTION_LANGUAGE` in `.env`. If neither is defined, the system will try to automatically detect the language.
|
||||
|
||||
The response will include the `transcription` field with the transcribed text:
|
||||
|
||||
```json
|
||||
{
|
||||
"transcription": "Transcribed text here..."
|
||||
}
|
||||
```
|
||||
|
||||
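When used with audio processing (`/process-audio`), the response will include both audio data and transcription. If the transcription fails, the error is only logged on the server and the `transcription` field is omitted from the response: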
|
||||
|
||||
```json
|
||||
{
|
||||
"duration": 120,
|
||||
"audio": "UklGR... (base64 of the file)",
|
||||
"format": "ogg",
|
||||
"transcription": "Transcribed text here..."
|
||||
}
|
||||
```
|
||||
|
||||
### Example Requests Using cURL
|
||||
|
||||
#### Sending as Form-data
|
||||
|
281
main.go
281
main.go
@ -3,10 +3,12 @@ package main
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/base64"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"flag"
|
||||
"fmt"
|
||||
"io"
|
||||
"mime/multipart"
|
||||
"net/http"
|
||||
"os"
|
||||
"os/exec"
|
||||
@ -28,51 +30,62 @@ var (
|
||||
return new(bytes.Buffer)
|
||||
},
|
||||
}
|
||||
allowedOrigins []string
|
||||
allowedOrigins []string
|
||||
enableTranscription bool
|
||||
transcriptionProvider string
|
||||
openaiAPIKey string
|
||||
groqAPIKey string
|
||||
defaultTranscriptionLanguage string
|
||||
)
|
||||
|
||||
func init() {
|
||||
devMode := flag.Bool("dev", false, "Rodar em modo de desenvolvimento")
|
||||
devMode := flag.Bool("dev", false, "Run in development mode")
|
||||
flag.Parse()
|
||||
|
||||
if *devMode {
|
||||
err := godotenv.Load()
|
||||
if err != nil {
|
||||
fmt.Println("Erro ao carregar o arquivo .env")
|
||||
fmt.Println("Error loading .env file")
|
||||
} else {
|
||||
fmt.Println("Arquivo .env carregado com sucesso")
|
||||
fmt.Println(".env file loaded successfully")
|
||||
}
|
||||
}
|
||||
|
||||
apiKey = os.Getenv("API_KEY")
|
||||
if apiKey == "" {
|
||||
fmt.Println("API_KEY não configurada no arquivo .env")
|
||||
fmt.Println("API_KEY not configured in .env file")
|
||||
}
|
||||
|
||||
allowOriginsEnv := os.Getenv("CORS_ALLOW_ORIGINS")
|
||||
if allowOriginsEnv != "" {
|
||||
allowedOrigins = strings.Split(allowOriginsEnv, ",")
|
||||
fmt.Printf("Origens permitidas: %v\n", allowedOrigins)
|
||||
fmt.Printf("Allowed origins: %v\n", allowedOrigins)
|
||||
} else {
|
||||
allowedOrigins = []string{"*"}
|
||||
fmt.Println("Nenhuma origem específica configurada, permitindo todas (*)")
|
||||
fmt.Println("No specific origins configured, allowing all (*)")
|
||||
}
|
||||
|
||||
enableTranscription = os.Getenv("ENABLE_TRANSCRIPTION") == "true"
|
||||
transcriptionProvider = os.Getenv("TRANSCRIPTION_PROVIDER")
|
||||
openaiAPIKey = os.Getenv("OPENAI_API_KEY")
|
||||
groqAPIKey = os.Getenv("GROQ_API_KEY")
|
||||
defaultTranscriptionLanguage = os.Getenv("TRANSCRIPTION_LANGUAGE")
|
||||
}
|
||||
|
||||
func validateAPIKey(c *gin.Context) bool {
|
||||
if apiKey == "" {
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": "Erro interno no servidor"})
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": "Internal server error"})
|
||||
return false
|
||||
}
|
||||
|
||||
requestApiKey := c.GetHeader("apikey")
|
||||
if requestApiKey == "" {
|
||||
c.JSON(http.StatusUnauthorized, gin.H{"error": "API_KEY não fornecida"})
|
||||
c.JSON(http.StatusUnauthorized, gin.H{"error": "API_KEY not provided"})
|
||||
return false
|
||||
}
|
||||
|
||||
if requestApiKey != apiKey {
|
||||
c.JSON(http.StatusUnauthorized, gin.H{"error": "API_KEY inválida"})
|
||||
c.JSON(http.StatusUnauthorized, gin.H{"error": "Invalid API_KEY"})
|
||||
return false
|
||||
}
|
||||
|
||||
@ -154,7 +167,176 @@ func getInputData(c *gin.Context) ([]byte, error) {
|
||||
return fetchAudioFromURL(url)
|
||||
}
|
||||
|
||||
return nil, errors.New("nenhum arquivo, base64 ou URL fornecido")
|
||||
return nil, errors.New("no file, base64 or URL provided")
|
||||
}
|
||||
|
||||
func transcribeAudio(audioData []byte, language string) (string, error) {
|
||||
if !enableTranscription {
|
||||
return "", errors.New("transcription is not enabled")
|
||||
}
|
||||
|
||||
switch transcriptionProvider {
|
||||
case "openai":
|
||||
return transcribeWithOpenAI(audioData, language)
|
||||
case "groq":
|
||||
return transcribeWithGroq(audioData, language)
|
||||
default:
|
||||
return "", errors.New("invalid transcription provider")
|
||||
}
|
||||
}
|
||||
|
||||
func transcribeWithOpenAI(audioData []byte, language string) (string, error) {
|
||||
if openaiAPIKey == "" {
|
||||
return "", errors.New("OpenAI API key not configured")
|
||||
}
|
||||
|
||||
// Se nenhum idioma foi especificado, use o padrão
|
||||
if language == "" {
|
||||
language = defaultTranscriptionLanguage
|
||||
}
|
||||
|
||||
// Salvar temporariamente o arquivo
|
||||
tempFile, err := os.CreateTemp("", "audio-*.ogg")
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer os.Remove(tempFile.Name())
|
||||
|
||||
if _, err := tempFile.Write(audioData); err != nil {
|
||||
return "", err
|
||||
}
|
||||
tempFile.Close()
|
||||
|
||||
url := "https://api.openai.com/v1/audio/transcriptions"
|
||||
body := &bytes.Buffer{}
|
||||
writer := multipart.NewWriter(body)
|
||||
|
||||
// Adicionar o arquivo
|
||||
file, err := os.Open(tempFile.Name())
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer file.Close()
|
||||
|
||||
part, err := writer.CreateFormFile("file", "audio.ogg")
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
io.Copy(part, file)
|
||||
|
||||
// Adicionar modelo e idioma
|
||||
writer.WriteField("model", "whisper-1")
|
||||
if language != "" {
|
||||
writer.WriteField("language", language)
|
||||
}
|
||||
|
||||
writer.Close()
|
||||
|
||||
req, err := http.NewRequest("POST", url, body)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
req.Header.Set("Authorization", "Bearer "+openaiAPIKey)
|
||||
req.Header.Set("Content-Type", writer.FormDataContentType())
|
||||
|
||||
resp, err := http.DefaultClient.Do(req)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
bodyBytes, _ := io.ReadAll(resp.Body)
|
||||
return "", fmt.Errorf("erro na API OpenAI (status %d): %s", resp.StatusCode, string(bodyBytes))
|
||||
}
|
||||
|
||||
var result struct {
|
||||
Text string `json:"text"`
|
||||
}
|
||||
if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
return result.Text, nil
|
||||
}
|
||||
|
||||
func transcribeWithGroq(audioData []byte, language string) (string, error) {
|
||||
if groqAPIKey == "" {
|
||||
return "", errors.New("Groq API key not configured")
|
||||
}
|
||||
|
||||
// Se nenhum idioma foi especificado, use o padrão
|
||||
if language == "" {
|
||||
language = defaultTranscriptionLanguage
|
||||
}
|
||||
|
||||
// Salvar temporariamente o arquivo
|
||||
tempFile, err := os.CreateTemp("", "audio-*.ogg")
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer os.Remove(tempFile.Name())
|
||||
|
||||
if _, err := tempFile.Write(audioData); err != nil {
|
||||
return "", err
|
||||
}
|
||||
tempFile.Close()
|
||||
|
||||
url := "https://api.groq.com/openai/v1/audio/transcriptions"
|
||||
body := &bytes.Buffer{}
|
||||
writer := multipart.NewWriter(body)
|
||||
|
||||
// Adicionar o arquivo
|
||||
file, err := os.Open(tempFile.Name())
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer file.Close()
|
||||
|
||||
part, err := writer.CreateFormFile("file", "audio.ogg")
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
io.Copy(part, file)
|
||||
|
||||
// Adicionar modelo e configurações
|
||||
writer.WriteField("model", "whisper-large-v3-turbo") // modelo mais rápido e com bom custo-benefício
|
||||
if language != "" {
|
||||
writer.WriteField("language", language)
|
||||
}
|
||||
writer.WriteField("response_format", "json")
|
||||
writer.WriteField("temperature", "0.0") // mais preciso
|
||||
|
||||
writer.Close()
|
||||
|
||||
req, err := http.NewRequest("POST", url, body)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
req.Header.Set("Authorization", "Bearer "+groqAPIKey)
|
||||
req.Header.Set("Content-Type", writer.FormDataContentType())
|
||||
|
||||
resp, err := http.DefaultClient.Do(req)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
bodyBytes, _ := io.ReadAll(resp.Body)
|
||||
return "", fmt.Errorf("erro na API Groq (status %d): %s", resp.StatusCode, string(bodyBytes))
|
||||
}
|
||||
|
||||
var result struct {
|
||||
Text string `json:"text"`
|
||||
}
|
||||
if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
return result.Text, nil
|
||||
}
|
||||
|
||||
func processAudio(c *gin.Context) {
|
||||
@ -176,16 +358,34 @@ func processAudio(c *gin.Context) {
|
||||
return
|
||||
}
|
||||
|
||||
c.JSON(http.StatusOK, gin.H{
|
||||
var transcription string
|
||||
if c.DefaultPostForm("transcribe", "false") == "true" {
|
||||
language := c.DefaultPostForm("language", "")
|
||||
trans, err := transcribeAudio(convertedData, language)
|
||||
if err != nil {
|
||||
fmt.Printf("Erro na transcrição: %v\n", err)
|
||||
// Continua sem a transcrição
|
||||
} else {
|
||||
transcription = trans
|
||||
}
|
||||
}
|
||||
|
||||
response := gin.H{
|
||||
"duration": duration,
|
||||
"audio": base64.StdEncoding.EncodeToString(convertedData),
|
||||
"format": format,
|
||||
})
|
||||
}
|
||||
|
||||
if transcription != "" {
|
||||
response["transcription"] = transcription
|
||||
}
|
||||
|
||||
c.JSON(http.StatusOK, response)
|
||||
}
|
||||
|
||||
func validateOrigin(origin string) bool {
|
||||
fmt.Printf("Validando origem: %s\n", origin)
|
||||
fmt.Printf("Origens permitidas: %v\n", allowedOrigins)
|
||||
fmt.Printf("Validating origin: %s\n", origin)
|
||||
fmt.Printf("Allowed origins: %v\n", allowedOrigins)
|
||||
|
||||
if len(allowedOrigins) == 0 {
|
||||
return true
|
||||
@ -203,41 +403,71 @@ func validateOrigin(origin string) bool {
|
||||
}
|
||||
|
||||
if allowed == origin {
|
||||
fmt.Printf("Origem %s corresponde a %s\n", origin, allowed)
|
||||
fmt.Printf("Origin %s matches %s\n", origin, allowed)
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
fmt.Printf("Origem %s não encontrada nas permitidas\n", origin)
|
||||
fmt.Printf("Origin %s not found in allowed origins\n", origin)
|
||||
return false
|
||||
}
|
||||
|
||||
func originMiddleware() gin.HandlerFunc {
|
||||
return func(c *gin.Context) {
|
||||
origin := c.Request.Header.Get("Origin")
|
||||
fmt.Printf("\n=== Debug CORS ===\n")
|
||||
fmt.Printf("Origin recebido: %s\n", origin)
|
||||
fmt.Printf("Headers completos: %+v\n", c.Request.Header)
|
||||
fmt.Printf("Origens permitidas: %v\n", allowedOrigins)
|
||||
fmt.Printf("\n=== CORS Debug ===\n")
|
||||
fmt.Printf("Received origin: %s\n", origin)
|
||||
fmt.Printf("Complete headers: %+v\n", c.Request.Header)
|
||||
fmt.Printf("Allowed origins: %v\n", allowedOrigins)
|
||||
fmt.Printf("=================\n")
|
||||
|
||||
if origin == "" {
|
||||
origin = c.Request.Header.Get("Referer")
|
||||
fmt.Printf("Origin vazio, usando Referer: %s\n", origin)
|
||||
fmt.Printf("Empty origin, using Referer: %s\n", origin)
|
||||
}
|
||||
|
||||
if !validateOrigin(origin) {
|
||||
fmt.Printf("❌ Origem rejeitada: %s\n", origin)
|
||||
c.JSON(http.StatusForbidden, gin.H{"error": "Origem não permitida"})
|
||||
fmt.Printf("❌ Origin rejected: %s\n", origin)
|
||||
c.JSON(http.StatusForbidden, gin.H{"error": "Origin not allowed"})
|
||||
c.Abort()
|
||||
return
|
||||
}
|
||||
|
||||
fmt.Printf("✅ Origem aceita: %s\n", origin)
|
||||
fmt.Printf("✅ Origin accepted: %s\n", origin)
|
||||
c.Next()
|
||||
}
|
||||
}
|
||||
|
||||
func transcribeOnly(c *gin.Context) {
|
||||
if !validateAPIKey(c) {
|
||||
return
|
||||
}
|
||||
|
||||
inputData, err := getInputData(c)
|
||||
if err != nil {
|
||||
c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
|
||||
return
|
||||
}
|
||||
|
||||
// Converter para ogg primeiro
|
||||
convertedData, _, err := convertAudio(inputData, "ogg")
|
||||
if err != nil {
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
|
||||
return
|
||||
}
|
||||
|
||||
language := c.DefaultPostForm("language", "")
|
||||
transcription, err := transcribeAudio(convertedData, language)
|
||||
if err != nil {
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
|
||||
return
|
||||
}
|
||||
|
||||
c.JSON(http.StatusOK, gin.H{
|
||||
"transcription": transcription,
|
||||
})
|
||||
}
|
||||
|
||||
func main() {
|
||||
port := os.Getenv("PORT")
|
||||
if port == "" {
|
||||
@ -256,6 +486,7 @@ func main() {
|
||||
router.Use(originMiddleware())
|
||||
|
||||
router.POST("/process-audio", processAudio)
|
||||
router.POST("/transcribe", transcribeOnly)
|
||||
|
||||
router.Run(":" + port)
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user