<?php

defined('BASEPATH') or exit('No direct script access allowed');

/**
 * AI File Parser Helper
 * 
 * Handles parsing and content extraction from various file types
 * for AI lead analysis
 * 
 * @package    Application
 * @subpackage Helpers
 * @category   AI Analysis
 * @author     Ibraya Group
 * @since      Version 1.0.0
 */

if (!function_exists('parse_file_for_ai_analysis')) {
    /**
     * Parse an uploaded file and extract its content for AI analysis.
     *
     * Dispatches to the type-specific parser selected by $file_type, which may
     * be a file extension ("pdf") or a MIME type ("application/pdf"). Matching
     * is case-insensitive, tolerates a leading dot and ignores MIME parameters
     * such as "; charset=utf-8". When $file_type is empty, the extension of
     * $file_path is used instead.
     *
     * Any exception or error raised by a parser is converted into a failed
     * result rather than propagated to the caller.
     *
     * @param string $file_path Full path to the uploaded file
     * @param string $file_type File MIME type or extension
     * @return array Array with the keys 'success' (bool), 'content' (string),
     *               'metadata' (array) and 'error' (string)
     */
    function parse_file_for_ai_analysis($file_path, $file_type = '')
    {
        $result = [
            'success' => false,
            'content' => '',
            'metadata' => [],
            'error' => ''
        ];

        // is_file() also rejects directories, which file_exists() would accept.
        if (!is_file($file_path)) {
            $result['error'] = 'File not found: ' . $file_path;
            return $result;
        }

        // Get file extension if type not provided
        if (empty($file_type)) {
            $file_type = strtolower(pathinfo($file_path, PATHINFO_EXTENSION));
        }

        // Normalise only the lookup key; the error message below still reports
        // the type exactly as the caller supplied it.
        $type_without_params = explode(';', (string) $file_type, 2)[0];
        $lookup_type = ltrim(strtolower(trim($type_without_params)), '.');

        try {
            switch ($lookup_type) {
                case 'txt':
                case 'text/plain':
                    $result = parse_text_file($file_path);
                    break;

                case 'pdf':
                case 'application/pdf':
                    $result = parse_pdf_file($file_path);
                    break;

                case 'doc':
                case 'docx':
                case 'application/msword':
                case 'application/vnd.openxmlformats-officedocument.wordprocessingml.document':
                    $result = parse_word_document($file_path);
                    break;

                case 'xls':
                case 'xlsx':
                case 'application/vnd.ms-excel':
                case 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet':
                    $result = parse_excel_file($file_path);
                    break;

                case 'jpg':
                case 'jpeg':
                case 'png':
                case 'gif':
                case 'bmp':
                case 'image/jpeg':
                case 'image/png':
                case 'image/gif':
                case 'image/bmp':
                    $result = parse_image_file($file_path);
                    break;

                default:
                    $result['error'] = 'Unsupported file type: ' . $file_type;
                    break;
            }
        } catch (Throwable $e) {
            // Throwable (not just Exception) so that engine errors, such as a
            // missing extension class, are reported instead of being fatal.
            $result['error'] = 'Error parsing file: ' . $e->getMessage();
        }

        return $result;
    }
}

if (!function_exists('parse_text_file')) {
    /**
     * Parse a plain text file.
     *
     * Reads the whole file into memory and reports its word count and its
     * size in bytes ('char_count' is computed with strlen(), so multi-byte
     * characters count once per byte).
     *
     * @param string $file_path Full path to the text file
     * @return array Result array with 'success', 'content', 'metadata' and
     *               'error'; 'success' is false when the file cannot be read
     */
    function parse_text_file($file_path)
    {
        // Check readability first so an unreadable path yields a clean error
        // result instead of a warning plus a bogus "successful" parse.
        $content = (is_file($file_path) && is_readable($file_path))
            ? file_get_contents($file_path)
            : false;

        if ($content === false) {
            return [
                'success' => false,
                'content' => '',
                'metadata' => ['type' => 'text'],
                'error' => 'Unable to read text file: ' . basename($file_path)
            ];
        }

        return [
            'success' => true,
            'content' => $content,
            'metadata' => [
                'type' => 'text',
                'word_count' => str_word_count($content),
                'char_count' => strlen($content)
            ],
            'error' => ''
        ];
    }
}

if (!function_exists('parse_pdf_file')) {
    /**
     * Parse a PDF file, trying several extraction methods in order.
     *
     * 1. The `pdftotext` command line tool (via shell_exec).
     * 2. The Smalot\PdfParser library, when it is loaded.
     * 3. A regex fallback that collects parenthesised literal strings from
     *    the raw file bytes. This is a last resort and can return noise.
     *
     * The method that produced the text is reported in
     * $result['metadata']['extraction_method'].
     *
     * @param string $file_path Full path to the PDF file
     * @return array Result array with 'success', 'content', 'metadata' and 'error'
     */
    function parse_pdf_file($file_path)
    {
        $result = [
            'success' => false,
            'content' => '',
            'metadata' => ['type' => 'pdf'],
            'error' => ''
        ];

        // Method 1: Try using pdftotext (if available)
        if (function_exists('shell_exec')) {
            // escapeshellarg() is required: the path comes from an upload and
            // double quotes alone do not stop $(...) or backtick substitution.
            $output = shell_exec('pdftotext ' . escapeshellarg($file_path) . ' -');
            if (!empty($output)) {
                $result['success'] = true;
                $result['content'] = $output;
                $result['metadata']['extraction_method'] = 'pdftotext';
                return $result;
            }
        }

        // Method 2: Try using PDF parser library (if available)
        if (class_exists('Smalot\PdfParser\Parser')) {
            try {
                $parser = new \Smalot\PdfParser\Parser();
                $pdf = $parser->parseFile($file_path);
                $text = $pdf->getText();

                $result['success'] = true;
                $result['content'] = $text;
                $result['metadata']['extraction_method'] = 'pdf_parser';
                return $result;
            } catch (Throwable $e) {
                // Deliberate best-effort: any library failure falls through
                // to the next extraction method.
            }
        }

        // Method 3: Basic PDF content extraction (limited)
        $content = file_get_contents($file_path);
        if ($content !== false && preg_match_all('/\(([^)]+)\)/', $content, $matches)) {
            $text = implode(' ', $matches[1]);
            if (!empty($text)) {
                $result['success'] = true;
                $result['content'] = $text;
                $result['metadata']['extraction_method'] = 'regex';
                return $result;
            }
        }

        $result['error'] = 'Unable to extract text from PDF file. Consider installing pdftotext or PDF parser library.';
        return $result;
    }
}

if (!function_exists('parse_word_document')) {
    /**
     * Parse a Word document (.doc/.docx).
     *
     * The format is chosen from the extension of $file_path:
     * - .docx: word/document.xml is read from the ZIP container and its text
     *   is extracted, with one line per paragraph.
     * - .doc:  the `antiword` command line tool is used when shell_exec exists.
     *
     * Any other extension, or a failure of the chosen method, yields the
     * generic "Unable to parse Word document" error.
     *
     * @param string $file_path Full path to the Word document
     * @return array Result array with 'success', 'content', 'metadata' and 'error'
     */
    function parse_word_document($file_path)
    {
        $result = [
            'success' => false,
            'content' => '',
            'metadata' => ['type' => 'word'],
            'error' => ''
        ];

        $extension = strtolower(pathinfo($file_path, PATHINFO_EXTENSION));

        // Both extensions are optional in PHP; without them fall through to
        // the generic error instead of raising an uncaught Error.
        if ($extension === 'docx' && class_exists('ZipArchive') && class_exists('DOMDocument')) {
            // Parse DOCX using ZIP and XML parsing
            try {
                $zip = new ZipArchive();
                if ($zip->open($file_path) === true) {
                    $xml = $zip->getFromName('word/document.xml');
                    $zip->close();

                    if ($xml) {
                        // Collect libxml errors internally so malformed XML
                        // does not emit warnings; restore the previous mode.
                        $previous_mode = libxml_use_internal_errors(true);
                        $dom = new DOMDocument();
                        // LIBXML_NONET: never fetch external resources while
                        // parsing an uploaded document.
                        $loaded = $dom->loadXML($xml, LIBXML_NONET);
                        libxml_clear_errors();
                        libxml_use_internal_errors($previous_mode);

                        if ($loaded && $dom->documentElement !== null) {
                            // Terminate every paragraph with a newline; plain
                            // textContent would glue adjacent paragraphs
                            // together ("HelloWorld").
                            $paragraphs = iterator_to_array($dom->getElementsByTagNameNS('*', 'p'));
                            foreach ($paragraphs as $paragraph) {
                                $paragraph->appendChild($dom->createTextNode("\n"));
                            }

                            $result['success'] = true;
                            $result['content'] = rtrim($dom->documentElement->textContent, "\n");
                            $result['metadata']['extraction_method'] = 'xml_parsing';
                            return $result;
                        }
                    }
                }
            } catch (Throwable $e) {
                $result['error'] = 'Error parsing DOCX: ' . $e->getMessage();
                return $result;
            }
        }

        // Try using antiword for .doc files (if available)
        if ($extension === 'doc' && function_exists('shell_exec')) {
            // escapeshellarg() prevents command injection through the
            // uploaded file name.
            $output = shell_exec('antiword ' . escapeshellarg($file_path));
            if (!empty($output)) {
                $result['success'] = true;
                $result['content'] = $output;
                $result['metadata']['extraction_method'] = 'antiword';
                return $result;
            }
        }

        $result['error'] = 'Unable to parse Word document. Make sure required libraries are installed.';
        return $result;
    }
}

if (!function_exists('parse_excel_file')) {
    /**
     * Parse an Excel file (.xls/.xlsx).
     *
     * Requires the PhpSpreadsheet library. Only the active worksheet is read.
     * Each row becomes one line of tab-separated cell values. Blank cells are
     * skipped, and rows that contain only blank cells are omitted from the
     * content.
     *
     * Metadata: 'rows' is the number of rows iterated (including blank ones)
     * and 'columns' is the largest number of non-blank cells found in a row.
     *
     * @param string $file_path Full path to the spreadsheet
     * @return array Result array with 'success', 'content', 'metadata' and 'error'
     */
    function parse_excel_file($file_path)
    {
        $result = [
            'success' => false,
            'content' => '',
            'metadata' => ['type' => 'excel'],
            'error' => ''
        ];

        // Try using PhpSpreadsheet (if available)
        if (class_exists('PhpOffice\PhpSpreadsheet\IOFactory')) {
            try {
                $spreadsheet = \PhpOffice\PhpSpreadsheet\IOFactory::load($file_path);
                $worksheet = $spreadsheet->getActiveSheet();

                $content = '';
                $row_count = 0;
                $col_count = 0;

                foreach ($worksheet->getRowIterator() as $row) {
                    $row_count++;
                    $cellIterator = $row->getCellIterator();
                    $cellIterator->setIterateOnlyExistingCells(false);

                    $row_data = [];
                    foreach ($cellIterator as $cell) {
                        $value = $cell->getCalculatedValue();

                        // Skip only truly blank cells. empty() would also
                        // discard legitimate values such as 0, "0" and false.
                        if ($value === null || $value === '') {
                            continue;
                        }

                        // Spell booleans out; implode() would render false as ''.
                        if (is_bool($value)) {
                            $value = $value ? 'TRUE' : 'FALSE';
                        }

                        $row_data[] = $value;
                    }

                    $col_count = max($col_count, count($row_data));

                    if (!empty($row_data)) {
                        $content .= implode("\t", $row_data) . "\n";
                    }
                }

                $result['success'] = true;
                $result['content'] = $content;
                $result['metadata']['extraction_method'] = 'phpspreadsheet';
                $result['metadata']['rows'] = $row_count;
                $result['metadata']['columns'] = $col_count;
                return $result;
            } catch (Throwable $e) {
                // Throwable so that library errors (not only exceptions) are
                // reported as a failed parse.
                $result['error'] = 'Error parsing Excel file: ' . $e->getMessage();
                return $result;
            }
        }

        $result['error'] = 'Unable to parse Excel file. PhpSpreadsheet library not available.';
        return $result;
    }
}

if (!function_exists('parse_image_file')) {
    /**
     * Parse an image file using OCR, falling back to basic metadata.
     *
     * Width, height and MIME type are recorded when getimagesize() can read
     * the file. Text is then extracted with the `tesseract` command line tool
     * when shell_exec exists and the tool produces output. Otherwise the
     * result still reports success, with a placeholder content line and a
     * note that OCR was not available.
     *
     * @param string $file_path Full path to the image file
     * @return array Result array with 'success', 'content', 'metadata' and 'error'
     */
    function parse_image_file($file_path)
    {
        $result = [
            'success' => false,
            'content' => '',
            'metadata' => ['type' => 'image'],
            'error' => ''
        ];

        // Get image information
        $image_info = getimagesize($file_path);
        if ($image_info) {
            $result['metadata']['width'] = $image_info[0];
            $result['metadata']['height'] = $image_info[1];
            $result['metadata']['mime_type'] = $image_info['mime'];
        }

        // Try OCR using Tesseract (if available)
        if (function_exists('shell_exec')) {
            // escapeshellarg() prevents command injection through the
            // uploaded file name.
            $output = shell_exec('tesseract ' . escapeshellarg($file_path) . ' stdout 2>/dev/null');

            // shell_exec() returns null/false when nothing was produced.
            $ocr_text = is_string($output) ? trim($output) : '';
            if ($ocr_text !== '') {
                $result['success'] = true;
                $result['content'] = $ocr_text;
                $result['metadata']['extraction_method'] = 'tesseract_ocr';
                return $result;
            }
        }

        // For now, just provide basic image metadata
        $result['success'] = true;
        $result['content'] = 'Image file uploaded: ' . basename($file_path);
        $result['metadata']['extraction_method'] = 'metadata_only';
        $result['metadata']['note'] = 'OCR not available. Install Tesseract for text extraction from images.';

        return $result;
    }
}

if (!function_exists('get_file_content_summary')) {
    /**
     * Create a summary of extracted file content for an AI prompt.
     *
     * Each entry of $files is a parser result keyed by file name. Failed
     * entries are listed with their error message. Successful entries are
     * listed with their type, extraction method (when present) and content.
     * Content longer than 2000 bytes is cut and marked "... [truncated]".
     *
     * @param array $files Array of parsed file results, keyed by file name
     * @return string Formatted content summary, or '' when $files is empty
     */
    function get_file_content_summary($files)
    {
        if (empty($files)) {
            return '';
        }

        // Limit content length for AI prompt
        $max_content_bytes = 2000;

        $summary = "\n\n=== UPLOADED FILES ANALYSIS ===\n";

        foreach ($files as $filename => $file_data) {
            // Tolerate malformed entries instead of raising undefined-index notices.
            if (!is_array($file_data) || empty($file_data['success'])) {
                $error = (is_array($file_data) && isset($file_data['error']))
                    ? $file_data['error']
                    : 'Unknown error';
                $summary .= "File: {$filename} - Error: {$error}\n";
                continue;
            }

            $type = isset($file_data['metadata']['type'])
                ? $file_data['metadata']['type']
                : 'unknown';

            $summary .= "\nFile: {$filename}\n";
            $summary .= "Type: {$type}\n";

            if (!empty($file_data['metadata']['extraction_method'])) {
                $summary .= "Extraction: {$file_data['metadata']['extraction_method']}\n";
            }

            $content = isset($file_data['content']) ? trim((string) $file_data['content']) : '';
            if ($content !== '') {
                if (strlen($content) > $max_content_bytes) {
                    // mb_strcut() cuts on a character boundary, so the prompt
                    // never ends in half of a multi-byte UTF-8 character
                    // (which would make the text invalid for json_encode()).
                    $content = function_exists('mb_strcut')
                        ? mb_strcut($content, 0, $max_content_bytes, 'UTF-8')
                        : substr($content, 0, $max_content_bytes);
                    $content .= '... [truncated]';
                }
                $summary .= "Content:\n{$content}\n";
            }

            $summary .= "---\n";
        }

        return $summary;
    }
}
