Docling/.actor/actor.sh
Václav Vančura 14d4f5b109
fix(integration): update the Apify Actor integration (#1619)
* fix(actor): remove references to missing docling_processor.py

Signed-off-by: Václav Vančura <commit@vancura.dev>

* chore(actor): update Actor README.md with recent repo URL changes

Signed-off-by: Václav Vančura <commit@vancura.dev>

* chore(actor): improve the Actor README.md local header link

Signed-off-by: Václav Vančura <commit@vancura.dev>

* chore(actor): bump the Actor version number

Signed-off-by: Václav Vančura <commit@vancura.dev>

* Update .actor/actor.json

Co-authored-by: Marek Trunkát <marek@trunkat.eu>
Signed-off-by: Jan Čurn <jan.curn@gmail.com>

---------

Signed-off-by: Václav Vančura <commit@vancura.dev>
Signed-off-by: Jan Čurn <jan.curn@gmail.com>
Co-authored-by: Jan Čurn <jan.curn@gmail.com>
Co-authored-by: Marek Trunkát <marek@trunkat.eu>
2025-05-21 02:47:55 +02:00

409 lines
13 KiB
Bash
Executable File

#!/bin/bash
export PATH=$PATH:/build-files/node_modules/.bin
# Function to upload content to the key-value store
upload_to_kvs() {
local content_file="$1"
local key_name="$2"
local content_type="$3"
local description="$4"
# Find the Apify CLI command
find_apify_cmd
local apify_cmd="$FOUND_APIFY_CMD"
if [ -n "$apify_cmd" ]; then
echo "Uploading $description to key-value store (key: $key_name)..."
# Create a temporary home directory with write permissions
setup_temp_environment
# Use the --no-update-notifier flag if available
if $apify_cmd --help | grep -q "\--no-update-notifier"; then
if $apify_cmd --no-update-notifier actor:set-value "$key_name" --contentType "$content_type" < "$content_file"; then
echo "Successfully uploaded $description to key-value store"
local url="https://api.apify.com/v2/key-value-stores/${APIFY_DEFAULT_KEY_VALUE_STORE_ID}/records/$key_name"
echo "$description available at: $url"
cleanup_temp_environment
return 0
fi
else
# Fall back to regular command if flag isn't available
if $apify_cmd actor:set-value "$key_name" --contentType "$content_type" < "$content_file"; then
echo "Successfully uploaded $description to key-value store"
local url="https://api.apify.com/v2/key-value-stores/${APIFY_DEFAULT_KEY_VALUE_STORE_ID}/records/$key_name"
echo "$description available at: $url"
cleanup_temp_environment
return 0
fi
fi
echo "ERROR: Failed to upload $description to key-value store"
cleanup_temp_environment
return 1
else
echo "ERROR: Apify CLI not found for $description upload"
return 1
fi
}
# Function to find Apify CLI command
find_apify_cmd() {
FOUND_APIFY_CMD=""
for cmd in "apify" "actor" "/usr/local/bin/apify" "/usr/bin/apify" "/opt/apify/cli/bin/apify"; do
if command -v "$cmd" &> /dev/null; then
FOUND_APIFY_CMD="$cmd"
break
fi
done
}
# Function to set up temporary environment for Apify CLI
setup_temp_environment() {
export TMPDIR="/tmp/apify-home-${RANDOM}"
mkdir -p "$TMPDIR"
export APIFY_DISABLE_VERSION_CHECK=1
export NODE_OPTIONS="--no-warnings"
export HOME="$TMPDIR" # Override home directory to writable location
}
# Function to clean up temporary environment
cleanup_temp_environment() {
rm -rf "$TMPDIR" 2>/dev/null || true
}
# Function to push data to Apify dataset
push_to_dataset() {
# Example usage: push_to_dataset "$RESULT_URL" "$OUTPUT_SIZE" "zip"
local result_url="$1"
local size="$2"
local format="$3"
# Find Apify CLI command
find_apify_cmd
local apify_cmd="$FOUND_APIFY_CMD"
if [ -n "$apify_cmd" ]; then
echo "Adding record to dataset..."
setup_temp_environment
# Use the --no-update-notifier flag if available
if $apify_cmd --help | grep -q "\--no-update-notifier"; then
if $apify_cmd --no-update-notifier actor:push-data "{\"output_file\": \"${result_url}\", \"format\": \"${format}\", \"size\": \"${size}\", \"status\": \"success\"}"; then
echo "Successfully added record to dataset"
else
echo "Warning: Failed to add record to dataset"
fi
else
# Fall back to regular command
if $apify_cmd actor:push-data "{\"output_file\": \"${result_url}\", \"format\": \"${format}\", \"size\": \"${size}\", \"status\": \"success\"}"; then
echo "Successfully added record to dataset"
else
echo "Warning: Failed to add record to dataset"
fi
fi
cleanup_temp_environment
fi
}
# --- Setup logging and error handling ---
LOG_FILE="/tmp/docling.log"
touch "$LOG_FILE" || {
echo "Fatal: Cannot create log file at $LOG_FILE"
exit 1
}
# Log to both console and file
exec 1> >(tee -a "$LOG_FILE")
exec 2> >(tee -a "$LOG_FILE" >&2)
# Exit codes
readonly ERR_API_UNAVAILABLE=15
readonly ERR_INVALID_INPUT=16
# --- Debug environment ---
echo "Date: $(date)"
echo "Python version: $(python --version 2>&1)"
echo "Docling-serve path: $(which docling-serve 2>/dev/null || echo 'Not found')"
echo "Working directory: $(pwd)"
# --- Get input ---
echo "Getting Apify Actor Input"
INPUT=$(apify actor get-input 2>/dev/null)
# --- Setup tools ---
echo "Setting up tools..."
TOOLS_DIR="/tmp/docling-tools"
mkdir -p "$TOOLS_DIR"
# Copy tools if available
if [ -d "/build-files" ]; then
echo "Copying tools from /build-files..."
cp -r /build-files/* "$TOOLS_DIR/"
export PATH="$TOOLS_DIR/bin:$PATH"
else
echo "Warning: No build files directory found. Some tools may be unavailable."
fi
# Check OCR directories and ensure they're writable
echo "Checking OCR directory permissions..."
OCR_DIR="/opt/app-root/src/.EasyOCR"
if [ -d "$OCR_DIR" ]; then
# Test if we can write to the directory
if touch "$OCR_DIR/test_write" 2>/dev/null; then
echo "[✓] OCR directory is writable"
rm "$OCR_DIR/test_write"
else
echo "[✗] OCR directory is not writable, setting up alternative in /tmp"
# Create alternative in /tmp (which is writable)
mkdir -p "/tmp/.EasyOCR/user_network"
export EASYOCR_MODULE_PATH="/tmp/.EasyOCR"
fi
else
echo "OCR directory not found, creating in /tmp"
mkdir -p "/tmp/.EasyOCR/user_network"
export EASYOCR_MODULE_PATH="/tmp/.EasyOCR"
fi
# --- Starting the API ---
echo "Starting docling-serve API..."
# Create a dedicated working directory in /tmp (writable)
API_DIR="/tmp/docling-api"
mkdir -p "$API_DIR"
cd "$API_DIR"
echo "API working directory: $(pwd)"
# Find docling-serve executable
DOCLING_SERVE_PATH=$(which docling-serve)
echo "Docling-serve executable: $DOCLING_SERVE_PATH"
# Start the API with minimal parameters to avoid any issues
echo "Starting docling-serve API..."
"$DOCLING_SERVE_PATH" run --host 0.0.0.0 --port 5001 > "$API_DIR/docling-serve.log" 2>&1 &
API_PID=$!
echo "Started docling-serve API with PID: $API_PID"
# A more reliable wait for API startup
echo "Waiting for API to initialize..."
MAX_TRIES=30
tries=0
started=false
while [ $tries -lt $MAX_TRIES ]; do
tries=$((tries + 1))
# Check if process is still running
if ! ps -p $API_PID > /dev/null; then
echo "ERROR: docling-serve API process terminated unexpectedly after $tries seconds"
break
fi
# Check log for startup completion or errors
if grep -q "Application startup complete" "$API_DIR/docling-serve.log" 2>/dev/null; then
echo "[✓] API startup completed successfully after $tries seconds"
started=true
break
fi
if grep -q "Permission denied\|PermissionError" "$API_DIR/docling-serve.log" 2>/dev/null; then
echo "ERROR: Permission errors detected in API startup"
break
fi
# Sleep and check again
sleep 1
# Output a progress indicator every 5 seconds
if [ $((tries % 5)) -eq 0 ]; then
echo "Still waiting for API startup... ($tries/$MAX_TRIES seconds)"
fi
done
# Show log content regardless of outcome
echo "docling-serve log output so far:"
tail -n 20 "$API_DIR/docling-serve.log"
# Verify the API is running
if ! ps -p $API_PID > /dev/null; then
echo "ERROR: docling-serve API failed to start"
if [ -f "$API_DIR/docling-serve.log" ]; then
echo "Full log output:"
cat "$API_DIR/docling-serve.log"
fi
exit $ERR_API_UNAVAILABLE
fi
if [ "$started" != "true" ]; then
echo "WARNING: API process is running but startup completion was not detected"
echo "Will attempt to continue anyway..."
fi
# Try to verify API is responding at this point
echo "Verifying API responsiveness..."
(python -c "
import sys, time, socket
for i in range(5):
try:
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
s.settimeout(1)
result = s.connect_ex(('localhost', 5001))
if result == 0:
s.close()
print('Port 5001 is open and accepting connections')
sys.exit(0)
s.close()
except Exception as e:
pass
time.sleep(1)
print('Could not connect to API port after 5 attempts')
sys.exit(1)
" && echo "API verification succeeded") || echo "API verification failed, but continuing anyway"
# Define API endpoint
DOCLING_API_ENDPOINT="http://localhost:5001/v1alpha/convert/source"
# --- Processing document ---
echo "Starting document processing..."
echo "Reading input from Apify..."
echo "Input content:" >&2
echo "$INPUT" >&2 # Send the raw input to stderr for debugging
echo "$INPUT" # Send the clean JSON to stdout for processing
# Create the request JSON
REQUEST_JSON=$(echo $INPUT | jq '.options += {"return_as_file": true}')
echo "Creating request JSON:" >&2
echo "$REQUEST_JSON" >&2
echo "$REQUEST_JSON" > "$API_DIR/request.json"
# Send the conversion request using our Python script
#echo "Sending conversion request to docling-serve API..."
#python "$TOOLS_DIR/docling_processor.py" \
# --api-endpoint "$DOCLING_API_ENDPOINT" \
# --request-json "$API_DIR/request.json" \
# --output-dir "$API_DIR" \
# --output-format "$OUTPUT_FORMAT"
echo "Curl the Docling API"
curl -s -H "content-type: application/json" -X POST --data-binary @$API_DIR/request.json -o $API_DIR/output.zip $DOCLING_API_ENDPOINT
CURL_EXIT_CODE=$?
# --- Check for various potential output files ---
echo "Checking for output files..."
if [ -f "$API_DIR/output.zip" ]; then
echo "Conversion completed successfully! Output file found."
# Get content from the converted file
OUTPUT_SIZE=$(wc -c < "$API_DIR/output.zip")
echo "Output file found with size: $OUTPUT_SIZE bytes"
# Calculate the access URL for result display
RESULT_URL="https://api.apify.com/v2/key-value-stores/${APIFY_DEFAULT_KEY_VALUE_STORE_ID}/records/OUTPUT"
echo "=============================="
echo "PROCESSING COMPLETE!"
echo "Output size: ${OUTPUT_SIZE} bytes"
echo "=============================="
# Set the output content type based on format
CONTENT_TYPE="application/zip"
# Upload the document content using our function
upload_to_kvs "$API_DIR/output.zip" "OUTPUT" "$CONTENT_TYPE" "Document content"
# Only proceed with dataset record if document upload succeeded
if [ $? -eq 0 ]; then
echo "Your document is available at: ${RESULT_URL}"
echo "=============================="
# Push data to dataset
push_to_dataset "$RESULT_URL" "$OUTPUT_SIZE" "zip"
fi
else
echo "ERROR: No converted output file found at $API_DIR/output.zip"
# Create error metadata
ERROR_METADATA="{\"status\":\"error\",\"error\":\"No converted output file found\",\"documentUrl\":\"$DOCUMENT_URL\"}"
echo "$ERROR_METADATA" > "/tmp/actor-output/OUTPUT"
chmod 644 "/tmp/actor-output/OUTPUT"
echo "Error information has been saved to /tmp/actor-output/OUTPUT"
fi
# --- Verify output files for debugging ---
echo "=== Final Output Verification ==="
echo "Files in /tmp/actor-output:"
ls -la /tmp/actor-output/ 2>/dev/null || echo "Cannot list /tmp/actor-output/"
echo "All operations completed. The output should be available in the default key-value store."
echo "Content URL: ${RESULT_URL:-No URL available}"
# --- Cleanup function ---
cleanup() {
echo "Running cleanup..."
# Stop the API process
if [ -n "$API_PID" ]; then
echo "Stopping docling-serve API (PID: $API_PID)..."
kill $API_PID 2>/dev/null || true
fi
# Export log file to KVS if it exists
# DO THIS BEFORE REMOVING TOOLS DIRECTORY
if [ -f "$LOG_FILE" ]; then
if [ -s "$LOG_FILE" ]; then
echo "Log file is not empty, pushing to key-value store (key: LOG)..."
# Upload log using our function
upload_to_kvs "$LOG_FILE" "LOG" "text/plain" "Log file"
else
echo "Warning: log file exists but is empty"
fi
else
echo "Warning: No log file found"
fi
# Clean up temporary files AFTER log is uploaded
echo "Cleaning up temporary files..."
if [ -d "$API_DIR" ]; then
echo "Removing API working directory: $API_DIR"
rm -rf "$API_DIR" 2>/dev/null || echo "Warning: Failed to remove $API_DIR"
fi
if [ -d "$TOOLS_DIR" ]; then
echo "Removing tools directory: $TOOLS_DIR"
rm -rf "$TOOLS_DIR" 2>/dev/null || echo "Warning: Failed to remove $TOOLS_DIR"
fi
# Keep log file until the very end
echo "Script execution completed at $(date)"
echo "Actor execution completed"
}
# Register cleanup
trap cleanup EXIT