<?php

/**
 * Reduce an HTML/PHP document to plain text.
 *
 * Steps: strip escape slashes, drop everything before the first <body tag
 * (when there is one), remove <style> and <script> elements together with
 * their contents, strip the remaining tags, then blank out "nbsp" remnants
 * and any stray '?', '<' or '>' characters.
 *
 * @param string $string  raw markup
 * @return string         text with markup removed
 */
function web_clean($string) {

	//get rid of escape slashes
	$string = stripslashes((string)$string);

	//if there is a body tag, remove everything before it
	$fromBody = stristr($string,"<body");
	if ($fromBody) $string = $fromBody;

	//remove stylesheet and javascript blocks, contents included.  strip_tags()
	//only removes the tags themselves, so without this step the css/js source
	//would be left behind as text.  The match is lazy, case-insensitive and
	//spans newlines, so bodies that contain '<' are handled too.
	foreach (array("style","script") as $tag) {

		$stripped = preg_replace('#<'.$tag.'\b[^>]*>.*?</'.$tag.'[^>]*>#is',"",$string);

		//preg_replace returns null on failure (e.g. backtrack limit); keep the input then
		if ($stripped !== null) $string = $stripped;

	}

	//strip all php and html tags
	$string = strip_tags($string);

	//turn what is left of &nbsp; entities into a space (case-insensitive)
	$string = str_ireplace("nbsp"," ",$string);

	//blank out any remaining tag characters
	$string = str_replace(array("?","<",">")," ",$string);

	return $string;
}


/**
 * Turn raw text (or the contents of a file) into a space-separated list of
 * unique, lowercase index words.
 *
 * @param string|null $passString   text to clean; ignored when $filepath is given
 * @param string|null $filepath     when set, the text is read from this file through sed
 * @param string      $keepIndex    characters to keep, written as the inside of a
 *                                  bracket expression (ranges such as a-z are allowed)
 * @param array|null  $preventIndex substrings to blank out of the text
 * @param mixed       $cleanHTML    when truthy, the text is also run through web_clean()
 * @return string                   unique words of 3 to 50 characters joined by spaces
 */
function string_clean($passString,$filepath,$keepIndex,$preventIndex,$cleanHTML) {

	//when cleaning html we must keep the tag characters for now so web_clean()
	//can remove whole tags later
	if ($cleanHTML) $keepIndex .= "<>?";

	//we are pulling directly from a file.  Sed performs the first pass.
	//Both arguments are shell-escaped so neither the path nor the keep list
	//can break out of the command.
	if ($filepath) {

		$sedScript = "s/[^".$keepIndex." ]//g";
		$passString = shell_exec("sed -e ".escapeshellarg($sedScript)." ".escapeshellarg($filepath));

	}

	if ($cleanHTML) $passString = web_clean($passString);

	//this works on either text passed to here, or anything sed may have missed.
	//$keepIndex is used as a bracket expression where a backslash is a literal
	//character; escape it and the pattern delimiter so PCRE reads it the same way.
	$keepClass = strtr((string)$keepIndex,array("\\" => "\\\\","/" => "\\/"));
	$string = preg_replace("/[^".$keepClass." ]/"," ",(string)$passString);

	//preg_replace returns null on failure (invalid keep list); index nothing then
	if ($string === null) $string = "";

	//make all lowercase
	$string = strtolower($string);

	//tolerate a missing prevent list
	if (!is_array($preventIndex)) $preventIndex = array();

	//get rid of any @ or / or . or : at the end or beginning of a word
	foreach (array(". ",": ","/ ","@ "," ."," :"," /"," @") as $edge) {
		$preventIndex[] = $edge;
	}

	//remove anything listed in our preventIndex array
	$string = str_replace($preventIndex," ",$string);

	$final_array = array();

	//strip out gibberish and words that are too long or too short
	foreach (explode(" ",$string) as $word) {

		$entry = trim($word);
		$length = strlen($entry);

		if (!$entry || $length < 3 || $length > 50) continue;

		//these checks keep out more trash when email/web address characters are kept
		if (strstr($entry,"@@")
			|| strstr($entry,"..")
			|| strstr($entry,"///")
			|| strstr($entry,"::")) continue;

		$final_array[] = $entry;

	}

	//remove duplicates and reindex
	$final_array = array_values(array_unique($final_array));

	//rejoin our segments
	return implode(" ",$final_array);

}

/****************************************************************************************

	This function splits a string into an array of words, then looks each word up
	through the server-side wordindex function.  The returned array of word ids is
	then inserted into the database as file/page/word links.  This function is
	called for each page of a file.

*****************************************************************************************/

/**
 * Link every word of a page to its file in dm_doclink.
 *
 * Each word of three or more characters is passed to the server-side
 * wordindex() function; the ids it returns are inserted as
 * (file_id, word_id, page_id) rows in one batched query.
 *
 * @param string $string   space-separated words
 * @param mixed  $file_id  id stored in dm_doclink.file_id
 * @param mixed  $page_id  id stored in dm_doclink.page_id
 * @param mixed  $conn     database connection handed to the query helpers
 * @return bool            true on success or when there was nothing to index,
 *                         false when the insert query failed
 */
function word_link($string,$file_id,$page_id,$conn) {

	$string = strtolower((string)$string);

	$wordIds = array();

	//look up (or create) the index id of every word that is long enough
	foreach (explode(" ",$string) as $word) {

		if (strlen($word) < 3) continue;

		//call our server-side wordindex function.  It basically checks
		//to see if the word is indexed.  If not, it inserts it.
		//The word comes from document content, so escape it before it goes
		//into the statement.
		//NOTE(review): addslashes() mirrors how this file escapes other SQL
		//values; switch to bound parameters if the db helpers support them.
		$sql = "SELECT wordindex('".addslashes($word)."')";

		$query_return = report_query($conn,$sql);
		$query_myrow = single_myrow_result($query_return[0],0);

		$wordIds[] = $query_myrow[0];

	}

	//nothing to link: do not send an empty statement to the database
	if (!$wordIds) return true;

	//insert all of our file/page/word links into the database.
	//we form one long query statement so we only have to
	//send it to the database once.
	$sql = "";

	foreach ($wordIds as $word_id) {

		$sql .= "INSERT INTO dm_doclink (file_id,word_id,page_id) VALUES ('$file_id','$word_id','$page_id');";

	}

	return db_query($conn,$sql) ? true : false;

}

/**
 * Extract the indexable text of a file according to its type.
 *
 *  - "txt" / "web": the file is cleaned directly through string_clean()
 *  - "image" / "tiff": when OCR_SUPPORT is defined, the image (or every page
 *    of a multi-page tiff) is converted to pbm and read with gocr
 *  - "pdf": when INDEX_PDF is defined, each page is extracted with
 *    ghostscript and cleaned on its own
 *
 * All shell arguments are passed through escapeshellarg().
 * NOTE(review): escapeshellarg() can drop non-ASCII bytes depending on the
 * LC_CTYPE locale; confirm the locale if file names may contain them.
 *
 * @param string     $filepath      path of the file on disk
 * @param string     $filename      display name; its type is resolved by file_extension()
 * @param string     $pic_convert   "yes" requests image-to-pdf conversion (currently disabled)
 * @param string     $delete_files  "yes" lets the disabled conversion delete its source tiff
 * @param string     $keepIndex     passed through to string_clean()
 * @param array|null $preventIndex  passed through to string_clean()
 * @return array  array($newstring, $multipage, $file2move, $filetext, $filename):
 *                cleaned text (or null), 1 when $filetext holds per-page text
 *                (else null), the slash-escaped path of the file to store, the
 *                per-page text array (or null), and the unslashed file name
 */
function file_process(	$filepath,
			$filename,
			$pic_convert,
			$delete_files,
			$keepIndex,
			$preventIndex) {

	$filename = stripslashes("$filename");

	$file_extension = file_extension("$filename");

	//everything we return starts out empty so every branch yields a defined value
	$newstring = null;
	$multipage = null;
	$file2move = null;
	$filetext = null;

	//this extracts contents from "text" files
	if ($file_extension=="txt") {

		$newstring = string_clean(null,$filepath,$keepIndex,$preventIndex,null);

	}

	//this extracts contents from "web" files.  This one will also strip all web/php tags
	elseif ($file_extension=="web") {

		$newstring = string_clean(null,$filepath,$keepIndex,$preventIndex,"yes");

	}

	//follow this path if we have an image, or a tif file that we are NOT converting to pdf
	elseif ($file_extension=="image" || $file_extension=="tiff") {

		//ocr image processing
		if (defined("OCR_SUPPORT")) {

			$multiOCR = null;

			//check to see if this is a tiff image with multiple pages
			if ($file_extension=="tiff") {

				//tiffinfo prints one "TIFF Directory" header per page
				$tiffInfo = shell_exec("tiffinfo ".escapeshellarg($filepath));
				$numpages = substr_count((string)$tiffInfo,"TIFF Directory");

				if ($numpages > 1) {

					$multiOCR = 1;

					//split the file, ocr each page separately, then
					//index the combined text as one string
					$pageFiles = tiffSplit($numpages,"$filepath");
					if (!is_array($pageFiles)) $pageFiles = array();

					$ocrText = "";

					foreach ($pageFiles as $pageFile) {

						$ocrText .= file_process_ocr_image($pageFile);

						//delete the temporary single-page tiff
						@unlink("$pageFile");

					}

					$newstring = string_clean($ocrText,null,$keepIndex,$preventIndex,null);

				}

			}

			//use this if a single image, or a single page tiff
			if (!$multiOCR) {

				$ocrText = file_process_ocr_image($filepath);
				$newstring = string_clean($ocrText,null,$keepIndex,$preventIndex,null);

			}

		}

		//convert the image to a pdf file.  The 1==2 statement
		//prevents this from running, as it's disabled
		if ($pic_convert == "yes" && 1==2) {

			//if this file is not a tiff, convert it to one
			if ($file_extension!="tiff") {

				$temp1 = $filepath;
				$temp2 = $filepath.".tif";

				shell_exec("convert ".escapeshellarg($temp1)." ".escapeshellarg($temp2));

				//delete the original, keep the tiff version
				@unlink($temp1);

				$filepath = $temp2;

			}

			//tiff2ps the original tiff file
			$filepathPS = $filepath.".ps";
			shell_exec("tiff2ps -a2 ".escapeshellarg($filepath)." > ".escapeshellarg($filepathPS));

			$filepathPDF = $filepathPS.".pdf";

			shell_exec("ps2pdf ".escapeshellarg($filepathPS)." ".escapeshellarg($filepathPDF));

			//delete the original tiff and the ps file, keep the pdf
			@unlink("$filepathPS");
			if ($delete_files=="yes") @unlink("$filepath");

			$file2move = $filepathPDF;

			//change the name to pdf if we're converting
			$pos = strrpos($filename,".");
			$filename = substr($filename,0,$pos);
			$filename = $filename.".pdf";

		}
	}

	elseif ($file_extension=="pdf" && defined("INDEX_PDF")) {

		//first we have to figure out how many pages are in the file.
		//this is a rough method: we read the number that follows the word
		//"through" in ghostscript's output, up to the next period.
		$gsOutput = (string)shell_exec("gs ".escapeshellarg($filepath));

		//stay at zero pages when the output does not have the expected shape;
		//a non-numeric count must never reach the loop condition below
		$numpages = 0;

		$pos1 = strpos($gsOutput,"through");

		if ($pos1 !== false) {

			$tail = substr($gsOutput,$pos1);
			$pos2 = strpos($tail,".");

			//8 skips the word "through" plus the character after it
			if ($pos2 !== false) $numpages = (int)trim(substr($tail,8,$pos2-8));

		}

		$filetext = array();

		for ($row=1;$row<=$numpages;$row++) {

			//gs the page and return as a string
			$tempstring = shell_exec("gs -q -dNODISPLAY -dNOBIND -dWRITESYSTEMDICT -dSIMPLE -dFirstPage=$row -dLastPage=$row -c save -f ps2ascii.ps ".escapeshellarg($filepath)." -c quit");

			//strip out all the trash from the string and keep it as this page's text
			$filetext[] = string_clean($tempstring,null,$keepIndex,$preventIndex,null);

		}

		$multipage = 1;
		$newstring = null;

	}

	if (!$file2move) $file2move = addslashes($filepath);

	return array($newstring,$multipage,$file2move,$filetext,$filename);

}

/**
 * OCR one image file: convert it to pbm with ImageMagick's convert, read the
 * pbm with gocr, then delete the pbm.  The only file type limitations are
 * those of the convert program.
 *
 * @param string $imagePath  path of the image to read
 * @return string            text reported by gocr ("" when it printed nothing)
 */
function file_process_ocr_image($imagePath) {

	//swap the extension for .pbm, or append it when there is no extension
	$dot = strrpos($imagePath,".");
	$base = $dot ? substr($imagePath,0,$dot) : $imagePath;
	$pbmPath = $base.".pbm";

	//convert to pbm format
	shell_exec("convert ".escapeshellarg($imagePath)." ".escapeshellarg($pbmPath)." 2>&1");

	//ocr the image and return the string
	$text = shell_exec("gocr ".escapeshellarg($pbmPath));

	//delete the temp file
	@unlink($pbmPath);

	return (string)$text;

}


/**
 * Store permissions, search-index links and categories for an object.
 *
 * Reads these keys from $valueArray: view_edit_array, view_array, object_id,
 * newstring, filetext, db_action, multipage, cat_grouping.  Missing or
 * non-array list values are treated as empty lists.
 *
 * Ids are escaped with addslashes() before they are placed in a statement.
 * NOTE(review): switch to bound parameters if the db helpers support them.
 *
 * @param mixed $conn        database connection handed to the query helpers
 * @param array $valueArray  see the key list above
 * @return array  keys view_edit_insert, view_insert, content_insert and
 *                cat_insert; each is 1 on success or null when that step failed
 */
function property_update($conn,$valueArray) {

	$listKeys = array("view_edit_array","view_array","filetext");
	$lists = array();

	//count() and foreach need real arrays; a group selection left empty arrives as null
	foreach ($listKeys as $listKey) {
		$lists[$listKey] = isset($valueArray[$listKey]) ? (array)$valueArray[$listKey] : array();
	}

	$view_edit_array = $lists["view_edit_array"];
	$view_array = $lists["view_array"];
	$filetext = $lists["filetext"];

	$object_id = isset($valueArray["object_id"]) ? $valueArray["object_id"] : null;
	$newstring = isset($valueArray["newstring"]) ? $valueArray["newstring"] : null;
	$db_action = isset($valueArray["db_action"]) ? $valueArray["db_action"] : null;
	$multipage = isset($valueArray["multipage"]) ? $valueArray["multipage"] : null;
	$cat_grouping = isset($valueArray["cat_grouping"]) ? $valueArray["cat_grouping"] : null;

	$safeObjectId = addslashes((string)$object_id);

	//set all of our insert checks
	$view_edit_insert = 1;
	$view_insert = 1;
	$content_insert = 1;
	$cat_insert = 1;

	/* Now insert permissions into the permissions table */

	//right now we have ids of groups.  We need to convert them to object ids
	//get the auth_objectids of all groups
	$sql = "SELECT * FROM auth_groups";

	$query_return = report_query($conn,$sql);

	//we have groups, so go ahead
	if ($query_return[1]!=0) {

		$id_array = array_result("auth_objectid",$query_return[0],$query_return[1]);
		$group_array = array_result("id",$query_return[0],$query_return[1]);

		//turn every entry that is a known group id into that group's
		//auth_objectid; entries that match no group are left untouched
		foreach ($view_edit_array as $idx => $entry) {

			$key = array_search($entry,$group_array);
			if ($key !== false) $view_edit_array[$idx] = $id_array[$key];

		}

		foreach ($view_array as $idx => $entry) {

			$key = array_search($entry,$group_array);
			if ($key !== false) $view_array[$idx] = $id_array[$key];

		}

		if ($db_action=="update") {

			//if we're updating, delete the existing permissions and category entries
			$sql = "DELETE FROM dm_file_permissions WHERE object_id='$safeObjectId';";
			$sql .= "DELETE FROM dm_file_cat WHERE file_id='$safeObjectId';";

			db_query($conn,$sql);

		}

		//view/edit permissions are stored with bitset 3
		foreach ($view_edit_array as $authId) {

			if (!$authId) continue;

			$safeAuthId = addslashes((string)$authId);

			$sql = "INSERT INTO dm_file_permissions
				(object_id,auth_objectid,bitset)
				VALUES
				('$safeObjectId','$safeAuthId','3')";

			if (!db_query($conn,$sql)) {
				$view_edit_insert = null;
				break;
			}
		}

		//view-only permissions are stored with bitset 2
		foreach ($view_array as $authId) {

			if (!$authId) continue;

			$safeAuthId = addslashes((string)$authId);

			$sql = "INSERT INTO dm_file_permissions
				(object_id,auth_objectid,bitset)
				VALUES
				('$safeObjectId','$safeAuthId','2')";

			if (!db_query($conn,$sql)) {
				$view_insert = null;
				break;
			}
		}

	//end the groups if statement
	}

	//if we're not updating, index the content
	if ($db_action != "update") {

		if ($multipage==1) {

			//multipage item: one entry of $filetext per page, pages numbered from 1
			foreach ($filetext as $row => $pageText) {

				$page = $row+1;

				if ($pageText && !word_link($pageText,$object_id,$page,$conn)) {

					$content_insert = null;
					break;
				}
			}

		}
		//insert contents for single page docs
		else {

			$page = 1;

			if ($newstring && !word_link($newstring,$object_id,$page,$conn)) $content_insert = null;

		}
	}

	//point the file to the unassigned category (id 1) if no category is selected
	if ($cat_grouping==null) $cat_grouping = array(1);
	else $cat_grouping = (array)$cat_grouping;

	foreach ($cat_grouping as $catId) {

		if (!$catId) continue;

		$safeCatId = addslashes((string)$catId);

		$sql = "INSERT INTO dm_file_cat
			(file_id,cat_id)
			VALUES
			('$safeObjectId','$safeCatId')";

		if (!db_query($conn,$sql)) {
			$cat_insert = null;
			break;
		}
	}

	return array(	"view_edit_insert" => $view_edit_insert,
			"view_insert" => $view_insert,
			"content_insert" => $content_insert,
			"cat_insert" => $cat_insert
			);

}


/**
 * Register a file in the database, index it, and copy it into data/.
 *
 * Reads these keys from $valueArray: filepath, filename, db_action,
 * accountid, current_date, cat_grouping, summary_value, version_value,
 * delete_files, pic_convert, keepIndex, preventIndex, view_edit_array,
 * view_array and, for updates, object_id.
 *
 * NOTE(review): the name, summary and the other values are placed in the
 * statements as given, apart from the addslashes() call on the file name
 * below; escaping of the rest is left to the caller -- confirm.
 *
 * @param mixed $conn        database connection handed to the query helpers
 * @param array $valueArray  see the key list above
 * @return mixed  the object id on success, otherwise an array naming the
 *                steps that failed ("object", "history", "view_edit",
 *                "view", "content", "category", "copy")
 */
function file_insert($conn,$valueArray) {

	$filepath = $valueArray["filepath"];
	$filename = $valueArray["filename"];
	$db_action = $valueArray["db_action"];
	$accountid = $valueArray["accountid"];
	$current_date = $valueArray["current_date"];
	$cat_grouping = $valueArray["cat_grouping"];
	$summary_value = $valueArray["summary_value"];
	$version_value = $valueArray["version_value"];
	$delete_files = $valueArray["delete_files"];

	if (!$version_value) $version_value = 1;
	if (!$db_action) $db_action = "insert";

	//an update needs the id of the object it updates; without it the
	//UPDATE statement below would match nothing
	//NOTE(review): assumes callers pass it under "object_id" -- confirm
	$object_id = null;
	if ($db_action=="update" && isset($valueArray["object_id"])) $object_id = $valueArray["object_id"];

	//result flags of the individual steps
	$object_insert = null;
	$history_insert = null;
	$view_edit_insert = null;
	$view_insert = null;
	$cat_insert = null;
	$content_insert = null;
	$file_id = null;

	//get the file size
	$file_size = filesize("$filepath");

	//determine the filename if this is an import: everything after the last
	//slash, or the whole path when it contains no slash
	if (!$filename) {

		$slash = strrpos($filepath,"/");
		$basename = ($slash === false) ? $filepath : substr($filepath,$slash+1);

		$filename = addslashes($basename);

	}

	$filename = addslashes($filename);

	$savename = $filename;

	//index the file and return the info
	$property_array = file_process(	$filepath,
					$filename,
					$valueArray["pic_convert"],
					$valueArray["delete_files"],
					$valueArray["keepIndex"],
					$valueArray["preventIndex"]
					);

	$newstring = $property_array[0];
	$multipage = $property_array[1];
	$file2move = $property_array[2];
	$filetext = $property_array[3];
	$filename = $property_array[4];

	//Begin the transaction
	beginTransaction($conn);

	//for some reason we keep getting tabs in our summary, this should get rid of those
	$summary_value = str_replace("\t","",(string)$summary_value);

	if ($db_action=="update") {

		$sql = "UPDATE dm_object SET
			name='$filename',
			version='$version_value',
			status='0',
			summary='$summary_value'
			WHERE id='$object_id'";

	}
	else {

		$sql = "INSERT INTO dm_object
			(name,version,summary,status,status_owner,file_owner,create_date,status_date)
			VALUES
			('$filename','$version_value','$summary_value','0','$accountid','$accountid','$current_date','$current_date')";

	}

	if ($result=db_query($conn,$sql)) {

		$object_insert = "yes";

		//a new row gets its id from the database; an update keeps the id it was given
		if ($db_action!="update" || !$object_id) {
			$object_id = db_insert_id("dm_object","id",$conn,$result);
		}

		//Insert it into the file_history table
		$sql = "INSERT INTO dm_file_history
				(object_id,size,version,modify,owner_id)
				VALUES
				('$object_id','$file_size','$version_value','$current_date','$accountid')";

		if ($result=db_query($conn,$sql)) {

			$history_insert = "yes";

			//the history id names the stored copy of the file
			$file_id = db_insert_id("dm_file_history","id",$conn,$result);

		}

		if ($db_action == "insert") {

			$optionArray = array( 	"view_edit_array" => $valueArray["view_edit_array"],
						"view_array" => $valueArray["view_array"],
						"object_id" => $object_id,
						"newstring" => $newstring,
						"filetext" => $filetext,
						"db_action" => $db_action,
						"multipage" => $multipage,
						"cat_grouping" => $cat_grouping
						);

			$returnArray = property_update($conn,$optionArray);

			$view_edit_insert = $returnArray["view_edit_insert"];
			$view_insert = $returnArray["view_insert"];
			$cat_insert = $returnArray["cat_insert"];
			$content_insert = $returnArray["content_insert"];

		}
		else {
			//properties are not touched here for other actions, so count them as done
			$view_edit_insert = 1;
			$view_insert = 1;
			$cat_insert = 1;
			$content_insert = 1;
		}

	//end object insert if statement
	}

	endTransaction($conn);

	$error_array = array();
	if (!$object_insert) $error_array[] = "object";
	if (!$history_insert) $error_array[] = "history";
	if (!$view_edit_insert) $error_array[] = "view_edit";
	if (!$view_insert) $error_array[] = "view";
	if (!$content_insert) $error_array[] = "content";
	if (!$cat_insert) $error_array[] = "category";

	//only if the above worked out do we copy the file
	if (!$error_array) {

		$upload_filename = $file_id.".docmgr";
		$destination = "data/".$upload_filename;

		//this strips slashes from a filename so the function can handle it
		$file2move = stripslashes($file2move);

		//copy the file to its final place
		if (copy("$file2move",$destination)) {

			//fix the permissions: owner read/write only
			@chmod($destination,0600);

			//see if this is a tiff converted to a pdf
			$extension = file_extension("$savename");

			//do we delete the files we imported?  Only reached once the
			//copy is known to exist, so the source is never the last copy
			if ($delete_files=="yes" || ($extension=="tiff" && $valueArray["pic_convert"]=="yes")) {
				@unlink("$file2move");
			}

			//return our object id for logging
			return $object_id;

		}

		//the database rows exist but the file could not be stored
		$error_array[] = "copy";

	}

	//delete the object just in case.  A failed update must not remove the
	//object that already existed before this call.
	if ($object_id && $db_action!="update") deleteObject($object_id);

	//return the error array for troubleshooting
	return $error_array;

}
