From 30a4990e539c280662281c767925d387b9e7ac4b Mon Sep 17 00:00:00 2001
From: Davidson Gomes <davidsongviolao@gmail.com>
Date: Mon, 2 Dec 2024 19:49:41 -0300
Subject: [PATCH] feat: add audio transcription feature with OpenAI and Groq
 support

- Updated .env.example to include transcription configuration options.
- Enhanced main.go to support audio transcription, including new endpoints and logic for handling transcription requests.
- Added functionality to transcribe audio using OpenAI and Groq APIs.
- Updated README.md with detailed instructions on enabling and using the transcription feature.
---
 .env.example |   7 +-
 README.md    |  63 +++++++++++-
 main.go      | 281 ++++++++++++++++++++++++++++++++++++++++++++++-----
 3 files changed, 324 insertions(+), 27 deletions(-)

diff --git a/.env.example b/.env.example
index ce3ca5f..b665449 100644
--- a/.env.example
+++ b/.env.example
@@ -1,3 +1,8 @@
 PORT=4040
 CORS_ALLOW_ORIGINS=*
-API_KEY=429683C4C977415CAAFCCE10F7D57E11
\ No newline at end of file
+API_KEY=429683C4C977415CAAFCCE10F7D57E11
+ENABLE_TRANSCRIPTION=true
+TRANSCRIPTION_PROVIDER=openai  # or groq
+OPENAI_API_KEY=sua_chave_openai_aqui
+GROQ_API_KEY=sua_chave_groq_aqui
+TRANSCRIPTION_LANGUAGE=pt
diff --git a/README.md b/README.md
index eb97e5b..ae94ffc 100644
--- a/README.md
+++ b/README.md
@@ -57,7 +57,23 @@ PORT=4040
 API_KEY=your_secret_api_key_here
 ```
 
-This defines the port where the service will run.
+### Transcription Configuration
+
+To enable audio transcription, configure the following variables in the `.env` file:
+
+```env
+ENABLE_TRANSCRIPTION=true
+TRANSCRIPTION_PROVIDER=openai  # or groq
+OPENAI_API_KEY=your_openai_key_here
+GROQ_API_KEY=your_groq_key_here
+TRANSCRIPTION_LANGUAGE=en  # Default transcription language (optional)
+```
+
+- `ENABLE_TRANSCRIPTION`: Enables or disables the transcription feature
+- `TRANSCRIPTION_PROVIDER`: Chooses the AI provider for transcription (openai or groq)
+- `OPENAI_API_KEY`: Your OpenAI API key (required if using openai)
+- `GROQ_API_KEY`: Your Groq API key (required if using groq)
+- `TRANSCRIPTION_LANGUAGE`: Sets the default transcription language (optional)
 
 ## Running the Project
 
@@ -107,6 +123,51 @@ All requests must include the `apikey` header with the value of the `API_KEY` co
   - `mp3`
   - `ogg` (default)
 
+### Audio Transcription
+
+You can get the audio transcription in two ways:
+
+1. Along with audio processing by adding the `transcribe=true` parameter:
+
+```bash
+curl -X POST -F "file=@audio.mp3" \
+  -F "transcribe=true" \
+  -F "language=en" \
+  http://localhost:4040/process-audio \
+  -H "apikey: your_secret_api_key_here"
+```
+
+2. Using the specific transcription endpoint:
+
+```bash
+curl -X POST -F "file=@audio.mp3" \
+  -F "language=en" \
+  http://localhost:4040/transcribe \
+  -H "apikey: your_secret_api_key_here"
+```
+
+Optional parameters:
+- `language`: Audio language code (e.g., "en", "es", "pt"). If not specified, it will use the value defined in `TRANSCRIPTION_LANGUAGE` in `.env`. If neither is defined, the system will try to automatically detect the language.
+
+The response will include the `transcription` field with the transcribed text:
+
+```json
+{
+  "transcription": "Transcribed text here..."
+}
+```
+
+When used with audio processing (`/process-audio`), the response will include both audio data and transcription (the `transcription` field is omitted when transcription fails or returns no text):
+
+```json
+{
+  "duration": 120,
+  "audio": "UklGR... (base64 of the file)",
+  "format": "ogg",
+  "transcription": "Transcribed text here..."
+}
+```
+
 ### Example Requests Using cURL
 
 #### Sending as Form-data
diff --git a/main.go b/main.go
index 9a92a3e..ec8b84d 100644
--- a/main.go
+++ b/main.go
@@ -3,10 +3,12 @@ package main
 import (
 	"bytes"
 	"encoding/base64"
+	"encoding/json"
 	"errors"
 	"flag"
 	"fmt"
 	"io"
+	"mime/multipart"
 	"net/http"
 	"os"
 	"os/exec"
@@ -28,51 +30,62 @@ var (
 			return new(bytes.Buffer)
 		},
 	}
-	allowedOrigins []string
+	allowedOrigins               []string
+	enableTranscription          bool
+	transcriptionProvider        string
+	openaiAPIKey                 string
+	groqAPIKey                   string
+	defaultTranscriptionLanguage string
 )
 
 func init() {
-	devMode := flag.Bool("dev", false, "Rodar em modo de desenvolvimento")
+	devMode := flag.Bool("dev", false, "Run in development mode")
 	flag.Parse()
 
 	if *devMode {
 		err := godotenv.Load()
 		if err != nil {
-			fmt.Println("Erro ao carregar o arquivo .env")
+			fmt.Println("Error loading .env file")
 		} else {
-			fmt.Println("Arquivo .env carregado com sucesso")
+			fmt.Println(".env file loaded successfully")
 		}
 	}
 
 	apiKey = os.Getenv("API_KEY")
 	if apiKey == "" {
-		fmt.Println("API_KEY não configurada no arquivo .env")
+		fmt.Println("API_KEY not configured in .env file")
 	}
 
 	allowOriginsEnv := os.Getenv("CORS_ALLOW_ORIGINS")
 	if allowOriginsEnv != "" {
 		allowedOrigins = strings.Split(allowOriginsEnv, ",")
-		fmt.Printf("Origens permitidas: %v\n", allowedOrigins)
+		fmt.Printf("Allowed origins: %v\n", allowedOrigins)
 	} else {
 		allowedOrigins = []string{"*"}
-		fmt.Println("Nenhuma origem específica configurada, permitindo todas (*)")
+		fmt.Println("No specific origins configured, allowing all (*)")
 	}
+
+	enableTranscription = os.Getenv("ENABLE_TRANSCRIPTION") == "true"
+	transcriptionProvider = os.Getenv("TRANSCRIPTION_PROVIDER")
+	openaiAPIKey = os.Getenv("OPENAI_API_KEY")
+	groqAPIKey = os.Getenv("GROQ_API_KEY")
+	defaultTranscriptionLanguage = os.Getenv("TRANSCRIPTION_LANGUAGE")
 }
 
 func validateAPIKey(c *gin.Context) bool {
 	if apiKey == "" {
-		c.JSON(http.StatusInternalServerError, gin.H{"error": "Erro interno no servidor"})
+		c.JSON(http.StatusInternalServerError, gin.H{"error": "Internal server error"})
 		return false
 	}
 
 	requestApiKey := c.GetHeader("apikey")
 	if requestApiKey == "" {
-		c.JSON(http.StatusUnauthorized, gin.H{"error": "API_KEY não fornecida"})
+		c.JSON(http.StatusUnauthorized, gin.H{"error": "API_KEY not provided"})
 		return false
 	}
 
 	if requestApiKey != apiKey {
-		c.JSON(http.StatusUnauthorized, gin.H{"error": "API_KEY inválida"})
+		c.JSON(http.StatusUnauthorized, gin.H{"error": "Invalid API_KEY"})
 		return false
 	}
 
@@ -154,7 +167,176 @@ func getInputData(c *gin.Context) ([]byte, error) {
 		return fetchAudioFromURL(url)
 	}
 
-	return nil, errors.New("nenhum arquivo, base64 ou URL fornecido")
+	return nil, errors.New("no file, base64 or URL provided")
+}
+
+func transcribeAudio(audioData []byte, language string) (string, error) {
+	if !enableTranscription { // set in init: ENABLE_TRANSCRIPTION must be exactly "true"
+		return "", errors.New("transcription is not enabled")
+	}
+
+	switch transcriptionProvider { // set in init from TRANSCRIPTION_PROVIDER
+	case "openai":
+		return transcribeWithOpenAI(audioData, language)
+	case "groq":
+		return transcribeWithGroq(audioData, language)
+	default: // any other value, including an unset variable
+		return "", errors.New("invalid transcription provider")
+	}
+}
+
+func transcribeWithOpenAI(audioData []byte, language string) (string, error) {
+	if openaiAPIKey == "" {
+		return "", errors.New("OpenAI API key not configured")
+	}
+
+	// If no language was specified, fall back to the configured default
+	if language == "" {
+		language = defaultTranscriptionLanguage
+	}
+
+	// Temporarily save the audio to a file (removed again when this function returns)
+	tempFile, err := os.CreateTemp("", "audio-*.ogg")
+	if err != nil {
+		return "", err
+	}
+	defer os.Remove(tempFile.Name())
+
+	if _, err := tempFile.Write(audioData); err != nil {
+		return "", err // NOTE(review): tempFile is not closed on this path
+	}
+	tempFile.Close()
+
+	url := "https://api.openai.com/v1/audio/transcriptions"
+	body := &bytes.Buffer{}
+	writer := multipart.NewWriter(body)
+
+	// Add the file: re-open the temp file and copy it into the multipart "file" part
+	file, err := os.Open(tempFile.Name())
+	if err != nil {
+		return "", err
+	}
+	defer file.Close()
+
+	part, err := writer.CreateFormFile("file", "audio.ogg")
+	if err != nil {
+		return "", err
+	}
+	io.Copy(part, file) // NOTE(review): the copy error is ignored
+
+	// Add model and language (the language field is omitted when empty)
+	writer.WriteField("model", "whisper-1")
+	if language != "" {
+		writer.WriteField("language", language)
+	}
+
+	writer.Close() // finishes the multipart message; must happen before the body is sent
+
+	req, err := http.NewRequest("POST", url, body)
+	if err != nil {
+		return "", err
+	}
+
+	req.Header.Set("Authorization", "Bearer "+openaiAPIKey)
+	req.Header.Set("Content-Type", writer.FormDataContentType())
+
+	resp, err := http.DefaultClient.Do(req) // NOTE(review): http.DefaultClient has no timeout
+	if err != nil {
+		return "", err
+	}
+	defer resp.Body.Close()
+
+	if resp.StatusCode != http.StatusOK {
+		bodyBytes, _ := io.ReadAll(resp.Body)
+		return "", fmt.Errorf("erro na API OpenAI (status %d): %s", resp.StatusCode, string(bodyBytes))
+	}
+
+	var result struct {
+		Text string `json:"text"`
+	}
+	if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
+		return "", err
+	}
+
+	return result.Text, nil
+}
+
+func transcribeWithGroq(audioData []byte, language string) (string, error) {
+	if groqAPIKey == "" {
+		return "", errors.New("Groq API key not configured")
+	}
+
+	// If no language was specified, fall back to the configured default
+	if language == "" {
+		language = defaultTranscriptionLanguage
+	}
+
+	// Temporarily save the audio to a file (removed again when this function returns)
+	tempFile, err := os.CreateTemp("", "audio-*.ogg")
+	if err != nil {
+		return "", err
+	}
+	defer os.Remove(tempFile.Name())
+
+	if _, err := tempFile.Write(audioData); err != nil {
+		return "", err // NOTE(review): tempFile is not closed on this path
+	}
+	tempFile.Close()
+
+	url := "https://api.groq.com/openai/v1/audio/transcriptions"
+	body := &bytes.Buffer{}
+	writer := multipart.NewWriter(body)
+
+	// Add the file: re-open the temp file and copy it into the multipart "file" part
+	file, err := os.Open(tempFile.Name())
+	if err != nil {
+		return "", err
+	}
+	defer file.Close()
+
+	part, err := writer.CreateFormFile("file", "audio.ogg")
+	if err != nil {
+		return "", err
+	}
+	io.Copy(part, file) // NOTE(review): the copy error is ignored
+
+	// Add model and settings (the language field is omitted when empty)
+	writer.WriteField("model", "whisper-large-v3-turbo") // faster model with a good cost/benefit trade-off
+	if language != "" {
+		writer.WriteField("language", language)
+	}
+	writer.WriteField("response_format", "json")
+	writer.WriteField("temperature", "0.0") // chosen for more precise output (original author's note)
+
+	writer.Close() // finishes the multipart message; must happen before the body is sent
+
+	req, err := http.NewRequest("POST", url, body)
+	if err != nil {
+		return "", err
+	}
+
+	req.Header.Set("Authorization", "Bearer "+groqAPIKey)
+	req.Header.Set("Content-Type", writer.FormDataContentType())
+
+	resp, err := http.DefaultClient.Do(req) // NOTE(review): http.DefaultClient has no timeout
+	if err != nil {
+		return "", err
+	}
+	defer resp.Body.Close()
+
+	if resp.StatusCode != http.StatusOK {
+		bodyBytes, _ := io.ReadAll(resp.Body)
+		return "", fmt.Errorf("erro na API Groq (status %d): %s", resp.StatusCode, string(bodyBytes))
+	}
+
+	var result struct {
+		Text string `json:"text"`
+	}
+	if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
+		return "", err
+	}
+
+	return result.Text, nil
+}
 
 func processAudio(c *gin.Context) {
@@ -176,16 +358,34 @@ func processAudio(c *gin.Context) {
 		return
 	}
 
-	c.JSON(http.StatusOK, gin.H{
+	var transcription string
+	if c.DefaultPostForm("transcribe", "false") == "true" {
+		language := c.DefaultPostForm("language", "")
+		trans, err := transcribeAudio(convertedData, language)
+		if err != nil {
+			fmt.Printf("Erro na transcrição: %v\n", err)
+			// Continue without the transcription
+		} else {
+			transcription = trans
+		}
+	}
+
+	response := gin.H{
 		"duration": duration,
 		"audio":    base64.StdEncoding.EncodeToString(convertedData),
 		"format":   format,
-	})
+	}
+
+	if transcription != "" {
+		response["transcription"] = transcription
+	}
+
+	c.JSON(http.StatusOK, response)
 }
 
 func validateOrigin(origin string) bool {
-	fmt.Printf("Validando origem: %s\n", origin)
-	fmt.Printf("Origens permitidas: %v\n", allowedOrigins)
+	fmt.Printf("Validating origin: %s\n", origin)
+	fmt.Printf("Allowed origins: %v\n", allowedOrigins)
 
 	if len(allowedOrigins) == 0 {
 		return true
@@ -203,41 +403,71 @@ func validateOrigin(origin string) bool {
 		}
 
 		if allowed == origin {
-			fmt.Printf("Origem %s corresponde a %s\n", origin, allowed)
+			fmt.Printf("Origin %s matches %s\n", origin, allowed)
 			return true
 		}
 	}
 
-	fmt.Printf("Origem %s não encontrada nas permitidas\n", origin)
+	fmt.Printf("Origin %s not found in allowed origins\n", origin)
 	return false
 }
 
 func originMiddleware() gin.HandlerFunc {
 	return func(c *gin.Context) {
 		origin := c.Request.Header.Get("Origin")
-		fmt.Printf("\n=== Debug CORS ===\n")
-		fmt.Printf("Origin recebido: %s\n", origin)
-		fmt.Printf("Headers completos: %+v\n", c.Request.Header)
-		fmt.Printf("Origens permitidas: %v\n", allowedOrigins)
+		fmt.Printf("\n=== CORS Debug ===\n")
+		fmt.Printf("Received origin: %s\n", origin)
+		fmt.Printf("Complete headers: %+v\n", c.Request.Header)
+		fmt.Printf("Allowed origins: %v\n", allowedOrigins)
 		fmt.Printf("=================\n")
 
 		if origin == "" {
 			origin = c.Request.Header.Get("Referer")
-			fmt.Printf("Origin vazio, usando Referer: %s\n", origin)
+			fmt.Printf("Empty origin, using Referer: %s\n", origin)
 		}
 
 		if !validateOrigin(origin) {
-			fmt.Printf("❌ Origem rejeitada: %s\n", origin)
-			c.JSON(http.StatusForbidden, gin.H{"error": "Origem não permitida"})
+			fmt.Printf("❌ Origin rejected: %s\n", origin)
+			c.JSON(http.StatusForbidden, gin.H{"error": "Origin not allowed"})
 			c.Abort()
 			return
 		}
 
-		fmt.Printf("✅ Origem aceita: %s\n", origin)
+		fmt.Printf("✅ Origin accepted: %s\n", origin)
 		c.Next()
 	}
 }
 
+func transcribeOnly(c *gin.Context) {
+	if !validateAPIKey(c) {
+		return // validateAPIKey has already written the error response
+	}
+
+	inputData, err := getInputData(c)
+	if err != nil {
+		c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
+		return
+	}
+
+	// Convert to ogg first; the transcription helpers upload the data as "audio.ogg"
+	convertedData, _, err := convertAudio(inputData, "ogg")
+	if err != nil {
+		c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
+		return
+	}
+
+	language := c.DefaultPostForm("language", "")
+	transcription, err := transcribeAudio(convertedData, language)
+	if err != nil {
+		c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
+		return
+	}
+
+	c.JSON(http.StatusOK, gin.H{
+		"transcription": transcription,
+	})
+}
+
 func main() {
 	port := os.Getenv("PORT")
 	if port == "" {
@@ -256,6 +486,7 @@ func main() {
 	router.Use(originMiddleware())
 
 	router.POST("/process-audio", processAudio)
+	router.POST("/transcribe", transcribeOnly)
 
 	router.Run(":" + port)
 }