File | Doc | Category | Size | Date | Package
HTMLUtils.java | API Doc | Azureus 3.0.3.4 | 8539 | Mon Jul 02 14:00:24 BST 2007 | org.gudy.azureus2.core3.html

HTMLUtils

public class HTMLUtils extends Object
author
parg

Fields Summary
Constructors Summary
Methods Summary
public static java.util.ListconvertHTMLToText(java.lang.String indent, java.lang.String text)
returns a list of strings for each line in a basic text representation

param
indent
param
text
return

		int		pos = 0;
		
		int orderedIndex = 0;
		
		text = text.replaceAll("<ol>","");
		text = text.replaceAll("</ol>","");
		text = text.replaceAll("<ul>","");
		text = text.replaceAll("</ul>","");
		text = text.replaceAll("</li>","");
		text = text.replaceAll("<li>","\n\t*");
		
		String lc_text = text.toLowerCase();
		
		List	lines = new ArrayList();
		
		while( true ){
			
			String	line;
			
			String[]	tokens = new String[]{ "<br>", "<p>" };
			
			String	token 	= null;
			int		p1		= -1;
			
			for (int i=0;i<tokens.length;i++){
				
				int	x = lc_text.indexOf( tokens[i], pos );
				
				if ( x != -1 ){
					if ( p1 == -1 || x < p1 ){
						token	= tokens[i];
						p1		= x;
					}
				}
			}

			if ( p1 == -1 ){
				
				line = text.substring(pos);
				
			}else{
				
				line = text.substring(pos,p1);
				
				pos = p1+token.length();
			}
			
			lines.add( indent + line );
			
			if ( p1 == -1 ){
				
				break;
			}
		}
		
		return( lines );
	
public static java.lang.StringconvertHTMLToText2(java.lang.String content)

		int	pos	= 0;

		String	res = "";
			
		content = removeTagPairs( content, "script" );
		
		content = content.replaceAll( " ", " " );

		content = content.replaceAll( "[\\s]+", " " );

		while(true){
			
			int	p1 = content.indexOf( "<",  pos );
			
			if ( p1 == -1 ){
				
				res += content.substring(pos);
				
				break;
			}
			
			int	p2 = content.indexOf( ">", p1 );
			
			if ( p2 == -1 ){
				
				res += content.substring(pos);
				
				break;
			}

			String	tag = content.substring(p1+1,p2).toLowerCase();
				
			res += content.substring(pos,p1);
			
			if ( tag.equals("p") || tag.equals("br")){
				
				if ( res.length() > 0 && res.charAt(res.length()-1) != '\n" ){
				
					res += "\n";
				}
			}
		
			pos	= p2+1;
		}
		
		res = res.replaceAll( "[ \\t\\x0B\\f\\r]+", " " );
		res = res.replaceAll( "[ \\t\\x0B\\f\\r]+\\n", "\n" );
		res = res.replaceAll( "\\n[ \\t\\x0B\\f\\r]+", "\n" );
		
		if ( res.length() > 0 && Character.isWhitespace(res.charAt(0))){
			
			res = res.substring(1);
		}
		
		return( res );
	
public static java.lang.StringconvertListToString(java.util.List list)

	  
	  StringBuffer result = new StringBuffer();
	  String separator = "";
	  Iterator iter = list.iterator();
	  while(iter.hasNext()) {
	    String line = iter.next().toString();
	    result.append(separator);
	    result.append(line);
	    separator = "\n";
	  }
	  
	  return result.toString();
	
public static java.lang.Stringexpand(java.lang.String str)

		str = XUXmlWriter.unescapeXML( str );
		
		str = str.replaceAll( " ", " " );
		
		return( str );
	
public static java.lang.Object[]getLinks(java.lang.String content_in)

		int	pos	= 0;

		List	urls = new ArrayList();
		
		String	content_out = "";
		
		String	current_url				= null;
		int		current_url_start		= -1;
		
		while(true){
			
			int	p1 = content_in.indexOf( "<", pos );
			
			if ( p1 == -1 ){
				
				break;
			}
			
			p1++;
			
			int	p2 = content_in.indexOf( ">", p1 );
			
			if ( p2 == -1 ){
				
				break;
			}
			
			if ( p1 > pos ){
				
				content_out += content_in.substring( pos, p1-1 );
			}
			
			int	old_pos = pos;
			
			pos	= p2+1;
			
			String	tag 	= content_in.substring( p1, p2 ).trim();
			
			String	lc_tag 	= tag.toLowerCase();
						
			if ( lc_tag.startsWith("a " )){
				
				int	hr_start = lc_tag.indexOf( "href");
				
				if ( hr_start == -1 ){
					
					continue;
				}
				
				hr_start = lc_tag.indexOf("=", hr_start);
				
				if ( hr_start == -1 ){
					
					continue;
				}
				
				hr_start += 1;
				
				while( 	hr_start < lc_tag.length() &&
						Character.isWhitespace(lc_tag.charAt(hr_start))){
					
					hr_start++;
				}
				
				int hr_end = lc_tag.length()-1;
				
				while(	hr_end >= lc_tag.length() &&
						Character.isWhitespace(lc_tag.charAt(hr_end))){
										
					hr_end--;
				}
				
				String	href = tag.substring(hr_start, hr_end+1 ).trim();
				
				if ( href.startsWith("\"")){
					
					href = href.substring(1,href.length()-1);
				}
				
				current_url = href;
				
				current_url_start = content_out.length();
								
			}else if ( lc_tag.startsWith( "/" ) && lc_tag.substring(1).trim().equals( "a" )){
				
				if ( current_url != null ){
										
					int	len = content_out.length() - current_url_start;
					
					urls.add( new Object[]{ current_url, new int[]{ current_url_start, len }});
				}
				
				current_url = null;
			}
		}
		
		if ( pos < content_in.length()){
			
			content_out += content_in.substring( pos );
		}
		
		return( new Object[]{ content_out, urls });
	
public static voidmain(java.lang.String[] args)

		Object[] obj = getLinks( "aaaaaaa <a href=\"http://here/parp  \">link< / a > prute <a href=\"http://here/pa\">klink</a>" );
		
		System.out.println( obj[0] );
		
		List	urls = (List)obj[1];
		
		for (int i=0;i<urls.size();i++){
			
			Object[]	entry = (Object[])urls.get(i);
			
			System.out.println( "    " + entry[0] + ((int[])entry[1])[0] + "," + ((int[])entry[1])[1] );
		}
	
public static java.lang.StringremoveTagPairs(java.lang.String content, java.lang.String tag_name)

		tag_name = tag_name.toLowerCase();
		
		String	lc_content = content.toLowerCase();
		
		int	pos	= 0;

		String	res = "";
		
		int	level 		= 0;
		int	start_pos	= -1;
		
		while(true){
			
			int	start_tag_start = lc_content.indexOf( "<" + tag_name,  pos );
			int end_tag_start	= lc_content.indexOf( "</" + tag_name, pos );
			
			if ( level == 0 ){
				
				if ( start_tag_start == -1 ){
					
					res += content.substring(pos);

					break;
				}
				
				res += content.substring(pos,start_tag_start);						

				start_pos = start_tag_start;
				
				level	= 1;
				
				pos		= start_pos+1;
				
			}else{
				
				if ( end_tag_start == -1 ){
					
					res += content.substring(pos);

					break;
				}
				
				if ( start_tag_start == -1 || end_tag_start < start_tag_start ){
					
					level--;
					
					int	end_end = lc_content.indexOf( '>", end_tag_start );
					
					if( end_end == -1 ){
						
						break;
					}
					
					pos	= end_end + 1;
					
				}else{
					
					if ( start_tag_start == -1 ){
						
						res += content.substring(pos);

						break;
					}
					
					level++;
					
					pos = start_tag_start+1;
				}
			}
		}
			
		return( res );
	
public static java.lang.StringsplitWithLineLength(java.lang.String str, int length)

		String	res = "";
		
		StringTokenizer tok = new StringTokenizer(str, "\n");
		
		while( tok.hasMoreTokens()){
			
			String	line = tok.nextToken();
			
			while( line.length() > length ){
			
				if ( res.length() > 0 ){
					
					res += "\n";
				}
	
				boolean	done = false;
				
				for (int i=length-1;i>=0;i--){
					
					if ( Character.isWhitespace( line.charAt(i))){
						
						done	= true;
				
						res += line.substring(0,i);
						
						line = line.substring(i+1);
						
						break;
					}
				}
				
				if ( !done ){
					
					res += line.substring(0,length);
					
					line = line.substring( length );
				}
			}
		
			if ( res.length() > 0 && line.length() > 0 ){
				
				res += "\n";

				res += line;
			}
		}
		
		return( res );