Methods Summary |
---|
public static java.util.List | convertHTMLToText(java.lang.String indent, java.lang.String text)returns a list of strings for each line in a basic text representation
// Flattens simple HTML into a list of text lines, each prefixed with `indent`.
// The tags <ol>, </ol>, <ul>, </ul> and </li> are dropped, every <li> becomes "\n\t*",
// and the text is cut into list entries at each <br> or <p>. The newline produced for
// an <li> does not start a new entry; it stays embedded in the entry that contains it.
// The list always holds at least one entry (the text after the last break, possibly empty).
// (?i) makes the list-tag patterns case-insensitive, in line with the <br>/<p> search below.
text = text.replaceAll("(?i)<ol>","");
text = text.replaceAll("(?i)</ol>","");
text = text.replaceAll("(?i)<ul>","");
text = text.replaceAll("(?i)</ul>","");
text = text.replaceAll("(?i)</li>","");
text = text.replaceAll("(?i)<li>","\n\t*");
// Break tokens are located in a lower-cased copy; the offsets are then applied to `text`.
String lc_text = text.toLowerCase();
String[] tokens = new String[]{ "<br>", "<p>" };
List lines = new ArrayList();
int pos = 0;
while( true ){
    // Pick whichever break token occurs first at or after pos.
    String token = null;
    int p1 = -1;
    for (int i=0;i<tokens.length;i++){
        int x = lc_text.indexOf( tokens[i], pos );
        if ( x != -1 && ( p1 == -1 || x < p1 )){
            token = tokens[i];
            p1 = x;
        }
    }
    if ( p1 == -1 ){
        // No further break: the remainder is the final entry.
        lines.add( indent + text.substring(pos) );
        break;
    }
    lines.add( indent + text.substring(pos,p1) );
    pos = p1+token.length();
}
return( lines );
|
public static java.lang.String | convertHTMLToText2(java.lang.String content)
// Reduces HTML to plain text: "script" elements are removed through removeTagPairs, every
// other <...> tag is dropped, and a tag whose text is exactly "p" or "br" (any case) ends
// the current line. Tags with attributes or a trailing slash (e.g. <br/>) are dropped
// without producing a line break.
content = removeTagPairs( content, "script" );
// Non-breaking spaces, written as the &nbsp; entity or as a raw U+00A0 character, become plain spaces.
content = content.replaceAll( "&nbsp;|\\u00A0", " " );
// Collapse every whitespace run (newlines included) so only <p>/<br> introduce line breaks.
content = content.replaceAll( "[\\s]+", " " );
StringBuffer out = new StringBuffer();
int pos = 0;
while(true){
    int p1 = content.indexOf( "<", pos );
    int p2 = p1 == -1 ? -1 : content.indexOf( ">", p1 );
    if ( p2 == -1 ){
        // No complete tag remains: keep the rest verbatim (an unmatched '<' stays in the output).
        out.append( content.substring(pos) );
        break;
    }
    out.append( content.substring(pos,p1) );
    String tag = content.substring(p1+1,p2).toLowerCase();
    if ( tag.equals("p") || tag.equals("br")){
        // Start a new line unless the output is still empty or already ends with one.
        if ( out.length() > 0 && out.charAt(out.length()-1) != '\n' ){
            out.append( "\n" );
        }
    }
    pos = p2+1;
}
String res = out.toString();
// Squeeze runs of non-newline whitespace, then trim the blanks touching each line break.
res = res.replaceAll( "[ \\t\\x0B\\f\\r]+", " " );
res = res.replaceAll( "[ \\t\\x0B\\f\\r]+\\n", "\n" );
res = res.replaceAll( "\\n[ \\t\\x0B\\f\\r]+", "\n" );
// Drop a single leading whitespace character, if any.
if ( res.length() > 0 && Character.isWhitespace(res.charAt(0))){
    res = res.substring(1);
}
return( res );
|
public static java.lang.String | convertListToString(java.util.List list)
// Joins the toString() form of every element with "\n" between consecutive elements.
// An empty list yields an empty string; no trailing newline is added.
StringBuffer joined = new StringBuffer();
boolean first = true;
for (Iterator it = list.iterator(); it.hasNext();){
    // toString() is called explicitly, so a null element raises NullPointerException.
    String text = it.next().toString();
    if ( !first ){
        joined.append("\n");
    }
    joined.append(text);
    first = false;
}
return joined.toString();
|
public static java.lang.String | expand(java.lang.String str)
// Passes str through XUXmlWriter.unescapeXML (defined outside this view; presumably decodes
// XML entities - confirm against that class), then replaces non-breaking spaces with plain ones.
str = XUXmlWriter.unescapeXML( str );
// Covers both the &nbsp; entity and a raw U+00A0 character.
str = str.replaceAll( "&nbsp;|\\u00A0", " " );
return( str );
|
public static java.lang.Object[] | getLinks(java.lang.String content_in)
// Strips every <...> tag from content_in and records where each anchor's text ends up.
// Returns Object[]{ String stripped, List links }. Each link is
// Object[]{ String href, int[]{ start, length } }, where start/length locate the anchor
// text inside `stripped`. A link is recorded only when its closing </a> is seen; an <a>
// tag without an href attribute is skipped. Text after a '<' that has no matching '>'
// is copied through unchanged.
StringBuffer content_out = new StringBuffer();
List urls = new ArrayList();
String current_url = null;
int current_url_start = -1;
int pos = 0;
while(true){
    int lt = content_in.indexOf( "<", pos );
    if ( lt == -1 ){
        break;
    }
    int gt = content_in.indexOf( ">", lt+1 );
    if ( gt == -1 ){
        break;
    }
    // Copy the text that precedes this tag, then step over the tag itself.
    content_out.append( content_in.substring( pos, lt ));
    pos = gt+1;
    String tag = content_in.substring( lt+1, gt ).trim();
    String lc_tag = tag.toLowerCase();
    if ( lc_tag.startsWith( "a " )){
        int hr_start = lc_tag.indexOf( "href" );
        if ( hr_start != -1 ){
            hr_start = lc_tag.indexOf( "=", hr_start );
        }
        if ( hr_start == -1 ){
            continue;
        }
        hr_start++;
        while( hr_start < tag.length() && Character.isWhitespace( tag.charAt( hr_start ))){
            hr_start++;
        }
        // The value ends at its closing quote, or at the first whitespace when unquoted,
        // so attributes that follow href do not leak into the URL.
        String href;
        char quote = hr_start < tag.length() ? tag.charAt( hr_start ) : 0;
        if ( quote == '"' || quote == '\'' ){
            int close = tag.indexOf( quote, hr_start+1 );
            // A missing closing quote takes the rest of the tag as the value.
            href = close == -1 ? tag.substring( hr_start+1 ) : tag.substring( hr_start+1, close );
        }else{
            int end = hr_start;
            while( end < tag.length() && !Character.isWhitespace( tag.charAt( end ))){
                end++;
            }
            href = tag.substring( hr_start, end );
        }
        current_url = href;
        current_url_start = content_out.length();
    }else if ( lc_tag.startsWith( "/" ) && lc_tag.substring(1).trim().equals( "a" )){
        // Closing anchor (whitespace inside the tag is tolerated): emit the pending link.
        if ( current_url != null ){
            int len = content_out.length() - current_url_start;
            urls.add( new Object[]{ current_url, new int[]{ current_url_start, len }});
        }
        current_url = null;
    }
}
if ( pos < content_in.length()){
    content_out.append( content_in.substring( pos ));
}
return( new Object[]{ content_out.toString(), urls });
|
public static void | main(java.lang.String[] args)
// Manual check of getLinks: prints the tag-stripped text of a fixed sample, then one line
// per link with its href followed by the start offset and length of the anchor text.
Object[] result = getLinks( "aaaaaaa <a href=\"http://here/parp \">link< / a > prute <a href=\"http://here/pa\">klink</a>" );
System.out.println( result[0] );
List links = (List)result[1];
for (Iterator it = links.iterator(); it.hasNext();){
    Object[] link = (Object[])it.next();
    int[] span = (int[])link[1];
    System.out.println( " " + link[0] + span[0] + "," + span[1] );
}
|
public static java.lang.String | removeTagPairs(java.lang.String content, java.lang.String tag_name)
// Removes every <tag_name ...> ... </tag_name> element from content, including the tags,
// the enclosed text and any nested elements of the same name. Matching is case-insensitive.
// The markers "<" + tag_name and "</" + tag_name are matched as prefixes, so a tag whose
// name merely starts with tag_name is treated as a match too.
// If an element is never closed, the text from the current scan position onward is
// appended as is. If a closing tag has no '>', everything from the element's start is dropped.
tag_name = tag_name.toLowerCase();
// Offsets are found in the lower-cased copy and applied to the original content.
String lc_content = content.toLowerCase();
String open_marker = "<" + tag_name;
String close_marker = "</" + tag_name;
StringBuffer res = new StringBuffer();
int pos = 0;
// Nesting depth of open tag_name elements at the current scan position.
int level = 0;
while(true){
    int start_tag_start = lc_content.indexOf( open_marker, pos );
    if ( level == 0 ){
        if ( start_tag_start == -1 ){
            // No more elements: keep the remainder.
            res.append( content.substring(pos));
            break;
        }
        // Keep the text before the element, then scan inside it.
        res.append( content.substring(pos,start_tag_start));
        level = 1;
        pos = start_tag_start+1;
        continue;
    }
    int end_tag_start = lc_content.indexOf( close_marker, pos );
    if ( end_tag_start == -1 ){
        res.append( content.substring(pos));
        break;
    }
    if ( start_tag_start == -1 || end_tag_start < start_tag_start ){
        // The next marker is a closing tag: leave one nesting level and skip past its '>'.
        level--;
        int end_end = lc_content.indexOf( '>', end_tag_start );
        if ( end_end == -1 ){
            break;
        }
        pos = end_end + 1;
    }else{
        // Another opening tag comes first: one level deeper.
        level++;
        pos = start_tag_start+1;
    }
}
return( res.toString());
|
public static java.lang.String | splitWithLineLength(java.lang.String str, int length)
// Re-wraps str so that no output line is longer than `length` characters.
// Input is split on "\n" (empty lines are skipped by StringTokenizer). A line that is too
// long is broken at the last whitespace found within its first `length` characters (the
// whitespace itself is dropped), or cut hard at `length` when there is none. Output lines
// are joined with "\n".
// Throws IllegalArgumentException when length is below 1, because a zero width can never
// make progress on a non-empty line.
if ( length < 1 ){
    throw new IllegalArgumentException( "length must be positive: " + length );
}
StringBuffer res = new StringBuffer();
StringTokenizer tok = new StringTokenizer(str, "\n");
while( tok.hasMoreTokens()){
    String line = tok.nextToken();
    while( line.length() > length ){
        if ( res.length() > 0 ){
            res.append( "\n" );
        }
        // Find the right-most whitespace inside the permitted width.
        int cut = -1;
        for (int i=length-1;i>=0;i--){
            if ( Character.isWhitespace( line.charAt(i))){
                cut = i;
                break;
            }
        }
        if ( cut == -1 ){
            res.append( line.substring(0,length));
            line = line.substring( length );
        }else{
            res.append( line.substring(0,cut));
            line = line.substring(cut+1);
        }
    }
    // Emit whatever is left of the line; the separator is needed only after earlier output,
    // so a short first line is kept rather than discarded.
    if ( line.length() > 0 ){
        if ( res.length() > 0 ){
            res.append( "\n" );
        }
        res.append( line );
    }
}
return( res.toString());
|