MS-Word-to-HTML document cleaner
This is a very rudimentary script for stripping gack from MS Word documents that have been converted to HTML.
I used preg_replace() and a series of patterns to remove excess styling from the document.
It’s not perfect by any means, and I make no promises or warranties that it will work for you. This code (which starts and stops between the <pre> and </pre> tags) is, however, free for you to use in any way you wish.
<?php
/**
 * Strips Microsoft Word styling residue from HTML that Word exported.
 *
 * Every pattern below is applied in order, each one to the output of the
 * previous one, and every match is deleted. The order therefore matters:
 * property names are removed first, then their values, then whatever empty
 * style attributes and stray punctuation are left behind.
 *
 * This is a blunt, text-level cleaner: the patterns are not anchored to
 * attribute or tag boundaries, so they can also remove matching text from
 * the document body (for example "l3" or "12pt" inside a sentence).
 *
 * @param string $html Word-generated HTML markup.
 * @return string The markup with all matched fragments removed. If the regex
 *                engine reports an error, a warning is raised and the most
 *                recent successfully produced markup is returned instead of
 *                an empty result.
 */
function cleanWordHtml($html)
{
    # i at the end of a pattern indicates a case-insensitive search.
    $patterns = array(
        # mso-* property names with one, two or three name segments
        '/mso-[a-z]{1,10}:/',
        '/mso-[a-z]{1,10}-[a-z]{1,10}:/',
        '/mso-[a-z]{1,10}-[a-z]{1,10}-[a-z]{1,10}:/',
        # Word's conditional markers
        '/<!\[if !supportEmptyParas\]>/',
        '/<!\[endif\]>/',
        '/<!\[if !supportLists\]>/',
        # literal list tokens; presumably the values left over from one
        # specific document's mso-list declarations -- adjust for other input
        '/l3/',
        '/level1/',
        '/lfo5/',
        # MsoNormal, MsoBodyText, ... (also matches lower-case "mso" + letters)
        '/Mso[a-zA-Z]{1,10}/i',
        '/<o:p><\/o:p>/',
        '/border:/',
        '/border-[a-zA-Z]{3,9}:/',
        '/margin:/',
        '/margin-[a-zA-Z]{3,9}:/',
        # runs after the Mso class names above have been deleted
        '/ class=/',
        # Non-breaking spaces, as the &nbsp; entity or as the UTF-8 bytes of
        # U+00A0. Ordinary spaces are deliberately NOT matched: removing them
        # would run every word together and would stop the later patterns
        # that contain a literal space from ever matching.
        '/(?:&nbsp;|\xC2\xA0){1,3}/',
        # measurements: fractional values first, then whole numbers
        '/[0-9]{0,3}\.[0-9]{1,2}in/',
        '/[0-9]{0,3}\.[0-9]{1,2}pt/',
        '/[0-9]{0,3}\.[0-9]{1,2}%/',
        '/[0-9]{1,3}pt/',
        '/[0-9]{1,3}in/',
        '/[0-9]{1,3}%/',
        '/solid windowtext/',
        '/text-indent:/',
        '/tab-stops:[a-zA-Z0-9]{0,8}/',
        '/font-[a-zA-Z]{4,7}:/',
        '/font:/',
        '/width:/',
        '/height:/',
        '/padding:/',
        '/padding-[a-zA-Z]{3,9}:/',
        '/color:/',
        '/none;/',
        '/collapse;/',
        '/background:/',
        '/Tahoma/',
        '/"Times New Roman"/',
        # span tags whose style value is short (mostly emptied by now), and
        # every closing span tag
        "/<span style='.{0,20}'>/",
        '/<span style=".{0,20}">/',
        '/<\/span>/',
        '/<span\s{0,20}style=".{0,10}">/',
        "/<span\s{0,20}style='.{0,10}'>/",
        # removes colors
        '/#[a-zA-Z0-9]{6};/',
        # may remove most style attributes: those left holding only punctuation
        "/style='\p{P}{0,10}'/",
        '/style="\p{P}{0,10}"/m',
        # cleaning up stray bits
        "/style='/",
        "/;\s;/",
        "/;\s{0,10}'/",
        "/\s;\s{0,20}'/",
        '/<p><b><u><\/u><\/b><\/p>/',
    );

    $cleaned = preg_replace($patterns, '', $html);
    if ($cleaned === null) {
        # preg_replace() returns null on a PCRE error; keep the document
        # rather than silently replacing it with nothing.
        trigger_error('cleanWordHtml: pattern stripping failed, returning input unchanged', E_USER_WARNING);
        return $html;
    }

    # Close tags whose ">" was left on its own line after the attributes
    # in front of it were removed.
    $tidied = preg_replace('/\r >/', '>', $cleaned);
    if ($tidied === null) {
        trigger_error('cleanWordHtml: final tag clean-up failed, skipping it', E_USER_WARNING);
        return $cleaned;
    }

    return $tidied;
}

# file_get_contents() returns false (and raises a warning itself) when the
# file cannot be read; nothing is printed in that case.
$input = file_get_contents('prog.html');
if ($input !== false) {
    print cleanWordHtml($input);
}
?>
Comments are closed.















