Chuyển file Doc thành PDF, txt or HTML dùng PHP và Linux

This has been an issue that has bothered me for a while. I finally found a solution that worked and doesn’t kill your server in the process.

I give to two words. OpenOffice, or is that one ?

This is what I’m running for this test:

OS: CentOS release 5.2 (Final)
PHP: PHP 5.2.8
Openoffice 1.2.3

Firstly I installed several programs using yum. You will need to use DAG’s repo:

rpm -Uhv http://apt.sw.be/redhat/el5/en/x86_64/rpmforge/RPMS//rpmforge-release-0.3.6-1.el5.rf.x86_64.rpm

yum install unoconv openoffice.org-headless openoffice.org-writer

unoconv is a handy tool that can be run as a demon and talk to the open office binary, via the command line.

In order to run the commands via apache you need to change the apache home directory and make it writable.

mkdir /home/apache
chown apache:apache /home/apache
usermod -d /home/apache apache
chmd 755 /home/apache

Now the apache user can create the hidden .openoffice.org2.0 directory.

With the setup out of the wa,y we need to start the open office deamon.

I did this as root but you could start this as apache.

unoconv –listener &

This basically creates the following deamon

soffice.bin -nologo -nodefault -accept=socket,host=localhost,port=2002;urp;StarOffice.ComponentContext

You can now send requests to port 2002 using unoconv

/usr/bin/unoconv --server localhost --port 2002 --stdout -f pdf input.doc

This will output the PDF file to the stdout.

Here is a cakephp component that I wrote to talk to unoconv. Please note this is very alpha and has only had a small amount of testing but works  If you want to use it you must create these directories in your cake install.

‘TMP_FOLDER’, TMP . ‘filegenerator/’
ROOT . ‘/uploads/generatedpdfs/’
ROOT . ‘/uploads/docfiles/’

It can be used via a form upload

$this->Filegenerator = new FilegeneratorComponent ($this->params["form"]['uploaddocfile']);
// if the filegenerator did all it's magic ok then process
if($this->Filegenerator){

// returns the text version of the PDF
$text = $this->Filegenerator->convertDocToTxt();
// returns the html of the PDF
$html = $this->Filegenerator->convertDocToHtml();
// returns the generated pdf file
$pdf = $this->Filegenerator->convertDocToPdf($doc_id);

}

The component called filegenerator.php

<?php
/**
* Class Used to convert files.
*@author jamiescott.net
*/
class FilegeneratorComponent extends Object {

// input folder types
private $allowable_files = array ('application/msword' => 'doc' );
// variable set if the constuctor loaded correctly.
private $pass = false;
// store the file info from constuctor reference
private $fileinfo;

/**
* Enter description here...
*
* @param array $fileinfo
* Expected :
* (
[name] => test.doc
[type] => application/msword
[tmp_name] => /Applications/MAMP/tmp/php/php09PYNO
[error] => 0
[size] => 79360
)
*
*
* @return unknown
*/
function __construct($fileinfo) {

// folder to process all the files etc
define ( 'TMP_FOLDER', TMP . 'filegenerator/' . $this->generatefoldername () . '/' );

// where unoconv is installed
define ( 'UNOCONV_PATH', '/usr/bin/unoconv' );
// where to store pdf files
define ( 'PDFSTORE', ROOT . '/uploads/generatedpdfs/' );
// where to store doc files
define ( 'DOCSTORE', ROOT . '/uploads/docfiles/' );
// apache home dir
define ( 'APACHEHOME', '/home/apache' );
// set some shell enviroment vars
putenv ( "HOME=".APACHEHOME );
putenv ( "PWD=".APACHEHOME );

// check the file info is passed the tmp file is there and the correct file type is set
// and the tmp folder could be created
if (is_array ( $fileinfo ) &amp;amp;&amp;amp; file_exists ( $fileinfo ['tmp_name'] ) &amp;amp;&amp;amp; in_array ( $fileinfo ['type'], array_keys ( $this->allowable_files ) ) &amp;amp;&amp;amp; $this->createtmp ()) {

// bass by reference
$this->fileinfo = &amp;amp;$fileinfo;
// the constuctor ran ok
$this->pass = true;
// return true to the instantiation
return true;

} else {
// faild to instantiate
return false;

}

}

/**
*      * takes the file set in the constuctor and turns it into a pdf
* stores it in /uploads/docfiles and returns the filename
*
* @return filename if pdf was generated
*/
function convertDocToPdf($foldername=false) {

if ($this->pass) {

// generate a random name
$output_pdf_name = $this->generatefoldername () . '.pdf';

// move it to the tmp folder for processing
if (! copy ( $this->fileinfo ['tmp_name'], TMP_FOLDER . 'input.doc' ))
die ( 'Error copying the doc file' );

$command = UNOCONV_PATH;
$args = ' --server localhost --port 2002 --stdout -f pdf ' . TMP_FOLDER . 'input.doc';

$run = $command . $args;

//echo $run; die;
$pdf = shell_exec ( $run );
$end_of_line = strpos ( $pdf, "\n" );
$start_of_file = substr ( $pdf, 0, $end_of_line );

if (! eregi ( '%PDF', $start_of_file ))
die ( 'Error Generating the PDF file' );

if(!file_exists(PDFSTORE.$foldername)){
mkdir(PDFSTORE.$foldername);
}

// file saved
if(!$this->_createandsave($pdf, PDFSTORE.'/'.$foldername.'/', $output_pdf_name)){
die('Error Saving The PDF');
}

return $output_pdf_name;

}

}

/**
* Return a text version of the Doc
*
* @return unknown
*/
function convertDocToTxt() {

if ($this->pass) {

// move it to the tmp folder for processing
if (! copy ( $this->fileinfo ['tmp_name'], TMP_FOLDER . 'input.doc' ))
die ( 'Error copying the doc file' );

$command = UNOCONV_PATH;
$args = ' --server localhost --port 2002 --stdout -f txt ' . TMP_FOLDER . 'input.doc';

$run = $command . $args;

//echo $run; die;
$txt = shell_exec ( $run );

// guess that if there is less than this characters probably an error
if (strlen($txt) < 10)
die ( 'Error Generating the TXT' );

// return the txt from the PDF
return $txt;

}

}

/**
* Convert the do to heml and return the html
*
* @return unknown
*/
function convertDocToHtml() {

if ($this->pass) {

// move it to the tmp folder for processing
if (! copy ( $this->fileinfo ['tmp_name'], TMP_FOLDER . 'input.doc' ))
die ( 'Error copying the doc file' );

$command = UNOCONV_PATH;
$args = ' --server localhost --port 2002 --stdout -f html ' . TMP_FOLDER . 'input.doc';

$run = $command . $args;

//echo $run; die;
$html= shell_exec ( $run );
$end_of_line = strpos ( $html, "\n" );
$start_of_file = substr ( $html, 0, $end_of_line );

if (! eregi ( 'HTML', $start_of_file ))
die ( 'Error Generating the HTML' );

// return the txt from the PDF
return $html;

}

}
/**
* Create file and store data
*
* @param unknown_type $data
* @param unknown_type $location
* @return unknown
*/
function _createandsave($data, $location, $file) {

if (is_writable ( $location )) {

// In our example we're opening $filename in append mode.
// The file pointer is at the bottom of the file hence
// that's where $somecontent will go when we fwrite() it.
if (! $handle = fopen ( $location.$file, 'w' )) {
trigger_error("Cannot open file ($location$file)");
return false;
}

// Write $somecontent to our opened file.
if (fwrite ( $handle, $data ) === FALSE) {
trigger_error("Cannot write to file ($location$file)");
return false;
}

fclose ( $handle );
return true;

} else {
trigger_error("The file $location.$file is not writable");
return false;
}

}

function __destruct() {

// remove the tmp folder

if (file_exists ( TMP_FOLDER ) &amp;amp;&amp;amp; strlen ( TMP_FOLDER ) > 4)
$this->removetmp ();

}

/**
* Create the tmp directory to hold and process the files
*
* @return unknown
*/
function createtmp() {

if (is_writable ( TMP )) {

if (mkdir ( TMP_FOLDER ))
return true;

} else {

return false;
}

return false;

}

/**
* Delete the tmp dir
*
* @return unknown
*/
function removetmp() {

if (strlen ( TMP_FOLDER ) > 3 &amp;amp;&amp;amp; file_exists ( TMP_FOLDER )) {

if ($this->recursive_remove_directory ( TMP_FOLDER ))
return true;

}

return false;
}

/**
* Return a rendom string for the folder name
*
* @return unknown
*/
function generatefoldername() {

return md5 ( microtime () );

}

/**
* Recursivly delete directroy or empty it
*
* @param unknown_type $directory
* @param unknown_type $empty
* @return unknown
*/
function recursive_remove_directory($directory, $empty = FALSE) {
// if the path has a slash at the end we remove it here
if (substr ( $directory, - 1 ) == '/') {
$directory = substr ( $directory, 0, - 1 );
}

// if the path is not valid or is not a directory ...
if (! file_exists ( $directory ) || ! is_dir ( $directory )) {
// ... we return false and exit the function
return FALSE;

// ... if the path is not readable
} elseif (! is_readable ( $directory )) {
// ... we return false and exit the function
return FALSE;

// ... else if the path is readable
} else {

// we open the directory
$handle = opendir ( $directory );

// and scan through the items inside
while ( FALSE !== ($item = readdir ( $handle )) ) {
// if the filepointer is not the current directory
// or the parent directory
if ($item != '.' &amp;amp;&amp;amp; $item != '..') {
// we build the new path to delete
$path = $directory . '/' . $item;

// if the new path is a directory
if (is_dir ( $path )) {
// we call this function with the new path
recursive_remove_directory ( $path );

// if the new path is a file
} else {
// we remove the file
unlink ( $path );
}
}
}
// close the directory
closedir ( $handle );

// if the option to empty is not set to true
if ($empty == FALSE) {
// try to delete the now empty directory
if (! rmdir ( $directory )) {
// return false if not possible
return FALSE;
}
}
// return success
return TRUE;
}
}
}
?>