/*==============================================================================



                             HTML2XHTML Converter 1.0

                             ========================

                       Copyright (c) 2004 Vyacheslav Smolin





Author:

-------

Vyacheslav Smolin (http://www.richarea.com, http://html2xhtml.richarea.com,

re@richarea.com)



About the script:

-----------------

HTML2XHTML Converter (H2X) generates a well formed XHTML string from a HTML DOM

object.



Requirements:

-------------

H2X works in  MS IE 5.0 for Windows or above,  in Netscape 7.1,  Mozilla 1.3 or

above. It should work in all Mozilla based browsers.



Usage:

------

Please see description of function get_xhtml below.



Demo:

-----

http://html2xhtml.richarea.com/, http://www.richarea.com/demo/



License:

--------

Free for non-commercial using. Please contact author for commercial licenses.





==============================================================================*/





//add \n before opening tag

var need_nl_before = '|div|p|table|tbody|tr|td|th|title|head|body|script|comment|li|meta|h1|h2|h3|h4|h5|h6|hr|ul|ol|option|';

//add \n after opening tag

var need_nl_after = '|html|head|body|p|th|style|';



var re_comment = new RegExp();

re_comment.compile("^<!--(.*)-->$");



var re_hyphen = new RegExp();

re_hyphen.compile("-$");





// Convert inner text of node to xhtml

// Call: get_xhtml(node);

//       get_xhtml(node, lang, encoding) -- to convert whole page

// other parameters are for inner usage and should be omitted

// Parameters:

// node - dom node to convert

// lang - document lang (need it if whole page converted)

// encoding - document charset (need it if whole page converted)

// need_nl - if true, add \n before a tag if it is in list need_nl_before

// inside_pre - if true, do not change content, as it is inside a <pre>

function get_xhtml(node, lang, encoding, need_nl, inside_pre) {

	var i;

	var text = '';

	var children = node.childNodes;

	var child_length = children.length;

	var tag_name;

	var do_nl = need_nl ? true : false;

	var page_mode = true;

	

	for (i = 0; i < child_length; i++) {

		var child = children[i];

		

		switch (child.nodeType) {

			case 1: { //ELEMENT_NODE

				var tag_name = String(child.tagName).toLowerCase();

				

				if (tag_name == '') break;

				

				if (tag_name == 'meta') {

					var meta_name = String(child.name).toLowerCase();

					if (meta_name == 'generator') break;

				}

				

				if (!need_nl && tag_name == 'body') { //html fragment mode

					page_mode = false;

				}

				

				if (tag_name == '!') { //COMMENT_NODE in IE 5.0/5.5

					//get comment inner text

					var parts = re_comment.exec(child.text);

					

					if (parts) {

						//the last char of the comment text must not be a hyphen

						var inner_text = parts[1];

						text += fix_comment(inner_text);

					}

				} else {

					if (tag_name == 'html') {

						text = '<?xml version="1.0" encoding="'+encoding+'"?>\n<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">\n';

					}

					

					//inset \n to make code more neat

					if (need_nl_before.indexOf('|'+tag_name+'|') != -1) {

						if ((do_nl || text != '') && !inside_pre) text += '\n';

					} else {

						do_nl = true;

					}

					

					text += '<'+tag_name;

					

					//add attributes

					var attr = child.attributes;

					var attr_length = attr.length;

					var attr_value;

					

					var attr_lang = false;

					var attr_xml_lang = false;

					var attr_xmlns = false;

					

					var is_alt_attr = false;

					

					for (j = 0; j < attr_length; j++) {

						var attr_name = attr[j].nodeName.toLowerCase();

						

						if (!attr[j].specified && 

							(attr_name != 'selected' || !child.selected) && 

							(attr_name != 'style' || child.style.cssText == '') && 

							attr_name != 'value') continue; //IE 5.0

						

						if (attr_name == '_moz_dirty' || 

							attr_name == '_moz_resizing' || 

							tag_name == 'br' && 

							attr_name == 'type' && 

							child.getAttribute('type') == '_moz') continue;

						

						var valid_attr = true;

						

						switch (attr_name) {

							case "style":

								attr_value = child.style.cssText;

								break;

							case "class":

								attr_value = child.className;

								break;

							case "http-equiv":

								attr_value = child.httpEquiv;

								break;

							case "noshade": break; //this set of choices will extend

							case "checked": break;

							case "selected": break;

							case "multiple": break;

							case "nowrap": break;

							case "disabled": break;

								attr_value = attr_name;

								break;

							default:

								try {

									attr_value = child.getAttribute(attr_name, 2);

								} catch (e) {

									valid_attr = false;

								}

								break;

						}

						

						//html tag attribs

						if (attr_name == 'lang') {

							attr_lang = true;

							attr_value = lang;

						}

						if (attr_name == 'xml:lang') {

							attr_xml_lang = true;

							attr_value = lang;

						}

						if (attr_name == 'xmlns') attr_xmlns = true;

						if (valid_attr) {

							//value attribute set to "0" is not handled correctly in Mozilla

							if (!(tag_name == 'li' && attr_name == 'value')) {

								text += ' '+attr_name+'="'+fix_attribute(attr_value)+'"';

							}

						}

						

						if (attr_name == 'alt') is_alt_attr = true;

					}

					

					if (tag_name == 'img' && !is_alt_attr) {

						text += ' alt=""';

					}

					

					if (tag_name == 'html') {

						if (!attr_lang) text += ' lang="'+lang+'"';

						if (!attr_xml_lang) text += ' xml:lang="'+lang+'"';

						if (!attr_xmlns) text += ' xmlns="http://www.w3.org/1999/xhtml"';

					}

					

					if (child.canHaveChildren || child.hasChildNodes()){

						text += '>';

//						if (need_nl_after.indexOf('|'+tag_name+'|') != -1) {

//							text += '\n';

//						}

						text += get_xhtml(child, lang, encoding, true, inside_pre || tag_name == 'pre' ? true : false);

						text += '</'+tag_name+'>';

					} else {

						if (tag_name == 'style' || tag_name == 'title' || tag_name == 'script' || tag_name == 'iframe') {

							text += '>';

							var inner_text;

							if (tag_name == 'script') {

								inner_text = child.text;

							} else {

								inner_text = child.innerHTML;

							}

							

							if (tag_name == 'style') {

								inner_text = String(inner_text).replace(/[\n]+/g,'\n');

							}

							

							text += inner_text+'</'+tag_name+'>';

						} else {

							text += ' />';

						}

					}

				}

				break;

			}

			case 3: { //TEXT_NODE

				if (!inside_pre) { //do not change text inside <pre> tag

					if (child.nodeValue != '\n') {

						text += fix_text(child.nodeValue);

					}

				} else {

					text += child.nodeValue;

				}

				break;

			}

			case 8: { //COMMENT_NODE

				text += fix_comment(child.nodeValue);

				break;

			}

			default:

				break;

		}

	}

	

	if (!need_nl && !page_mode) { //delete head and body tags from html fragment

		text = text.replace(/<\/?head>[\n]*/gi, "");

		text = text.replace(/<head \/>[\n]*/gi, "");

		text = text.replace(/<\/?body>[\n]*/gi, "");

	}

	

	return text;

}



//fix inner text of a comment

function fix_comment(text) {

	//delete double hyphens from the comment text

	text = text.replace(/--/g, "__");

	

	if(re_hyphen.exec(text)) { //last char must not be a hyphen

		text += " ";

	}

	

	return "<!--"+text+"-->";

}



//fix content of a text node

function fix_text(text) {

	//convert <,> and & to the corresponding entities

	return String(text).replace(/\n{2,}/g, "\n").replace(/\&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;").replace(/\u00A0/g, "&nbsp;");

}



//fix content of attributes href, src or background

function fix_attribute(text) {

	//convert <,>, & and " to the corresponding entities

	return String(text).replace(/\&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;").replace(/\"/g, "&quot;");

}


