1    // Copyright 2004 Adam Megacz, see the COPYING file for licensing [GPL]
2    package org.xwt.translators;
3    
4    import java.util.*;
5    import java.net.*;
6    import java.io.*;
7    import org.xwt.js.*;
8    import org.xwt.util.*;
9    
10   /* 
 * While entities are limited to a subset of Unicode characters,
 * numeric character references can specify any character. Numeric
 * character references may be given in decimal or hexadecimal, though
 * browser support is stronger for decimal references. Decimal
 * references are of the form &#number; while hexadecimal references
 * take the case-insensitive form &#xnumber;. Examples of numeric
 * character references include &#169; or &#xA9; for the copyright
 * symbol, &#913; or &#x391; for the Greek capital letter alpha, and
 * &#1575; or &#x627; for the Arabic letter ALEF.
20    *
21    * http://www.htmlhelp.com/reference/html40/entities/special.html
22    * http://www.htmlhelp.com/reference/html40/entities/symbols.html
23    * http://www.htmlhelp.com/reference/html40/entities/latin1.html
24    */
25   
26   /**
27    *   This class parses an InputStream containing HTML and returns it
28    *   as an XWT DOM tree. Each HTML Element is returned as a struct,
29    *   with the following members:
30    *
31    *   Since HTML may have multiple top level elements (unlike XML),
32    *   this class will search all top level elements for one with a tag
33    *   name 'html'. If such a node is found, only it is returned. If no
34    *   top-level element has the tag name 'html', such a node is
35    *   fabricated, and all top level elements become the children of
36    *   that node, which is then returned.
37    */
38   public class HTML {
39   
40       private final static String[] noEndTag =
41           new String[] { "area", "base", "basefont", "br", "col", "frame", "hr", "img",
42                          "input", "isindex", "link", "meta", "param" };
43   
44       /** we keep a char[] around for use by removeRedundantWhitespace() */
45       private static char[] cbuf = null;
46   
47       /** we keep a StringBuffer around for use by removeRedundantWhitespace() */
48       private static StringBuffer sbuf = null;
49   
50       /** true iff we have encountered an LI more recently than the last OL/UL */
51       private static boolean withinLI = false;
52   
53       public static synchronized JS parseReader(Reader r) throws IOException, JSExn {
54           CharStream cs = new CharStream(r);
55           JS h = new JS();
56   
57           withinLI = false;
58           h.put("$name", "html");
59   
60           try {
61               while (true) parseBody(cs, h, null);
62           } catch (EOFException e) {
63               // continue until we get an EOFException
64           }
65           
66           /* FIXME
67           Object[] ids = h.keys();
68           for(int i=0; i<ids.length; i++) {
69               Object el = h.get((String)ids[i]);
70               if (el instanceof JS && "html".equals(((JS)el).get("$name")))
71                   return (JS)el;
72           }
73           */        
74           return h;
75       }
76   
77       /**
78        *  Parses a single element and stores it in <tt>h</tt>. The
79        *  CharStream should be positioned immediately <i>after</i> the
80        *  open bracket.
81        *
82        *  If a close tag not matching this open tag is found, the
83        *  tagname on the close tag will be returned in order to
84        *  facilitate correcting broken HTML. Otherwise, this returns
85        *  null.
86        */
87       private static String parseElement(CharStream cs, JS h) throws IOException, JSExn {
88           // scan element name
89           while(Character.isSpace(cs.peek())) cs.get();
90           String elementName = parseElementName(cs);
91   
92           boolean saveWithinLI = withinLI;
93           if (elementName.equals("li")) {
94               if (withinLI) {
95                   cs.unread(new char[] { '<', 'l', 'i', ' ' });
96                   return "li";
97               } else {
98                   withinLI = true;
99               }
100          } else if (elementName.equals("ol") || elementName.equals("ul")) {
101              withinLI = false;
102          }
103  
104          h.put("$name", elementName);
105          if (elementName.equals("!--")) {
106              h.put("0", parseComment(cs));
107              h.put("$numchildren", new Integer(0));
108              return null;
109          }
110  
111          // scan attributes
112          while (cs.peek() != '>') {
113              String name = parseAttributeName(cs);
114              if (name.equals("")) break;
115              String value = expandEntities(parseAttributeValue(cs));
116              h.put(name, value);
117          } 
118  
119          // eat the close-angle bracket
120          cs.get();
121  
122          // bodyless tags return here
123          for(int i=0; i<noEndTag.length; i++)
124              if (noEndTag[i].equals(elementName))
125                  return null;
126  
127          // scan body
128          String ret = parseBody(cs, h, elementName);
129          withinLI = saveWithinLI;
130          return ret;
131      }
132  
133      /**
134       *  Parses the body of an element. The CharStream should be
135       *  positioned at the character immediately after the right
136       *  bracket closing the start-tag
137       */
138      private static String parseBody(CharStream cs, JS h, String elementName) throws IOException, JSExn {
139          String cdata = "";
140          int length = h.get("$numchildren") == null ? 0 : Integer.parseInt(h.get("$numchildren").toString());
141          while(true) {
142              String closetag = null;
143  
144              try {
145                  char c = cs.get();
146                  if (c != '<') { cdata += c; continue; }
147                  String expanded = removeRedundantWhitespace(expandEntities(cdata));
148                  if (expanded.length() > 0) {
149                      h.put(String.valueOf(length), expanded);
150                      h.put("$numchildren", new Integer(++length));
151                  }
152                  cdata = "";
153  
154              } catch (EOFException e) {
155                  String expanded = removeRedundantWhitespace(expandEntities(cdata));
156                  if (expanded.length() > 0) {
157                      h.put(String.valueOf(length), expanded);
158                      h.put("$numchildren", new Integer(++length));
159                  }
160                  throw e;
161              }
162                  
163              try {
164                  // scan subelement
165                  if (cs.peek() != '/') {
166                      JS kid = new JS();
167                      closetag = parseElement(cs, kid);
168                      h.put(String.valueOf(length), kid); 
169                      h.put("$numchildren", new Integer(++length));
170                      
171                  // scan close-tag
172                  } else {
173                      cs.get(); // drop the slash
174                      closetag = parseElementName(cs);
175                      while(cs.get() != '>');
176                  }
177              } catch (EOFException e) {
178                  throw e;
179  
180              }
181              
182              if (closetag != null)
183                  return closetag.equals(elementName) ? null : closetag;
184          }
185      }
186  
187      /** Parses an element name and returns it. The CharStream should
188       *  be positioned at the first character of the name.
189       */
190      private static String parseElementName(CharStream cs) throws IOException, JSExn {
191          String ret = "";
192          while (cs.peek() != '>' && !Character.isSpace(cs.peek())) ret += cs.get();
193          return ret.toLowerCase();
194      }
195  
196      /** Parses an attribute name and returns it. The CharStream should
197       *  be positioned at the first character of the name, possibly
198       *  with intervening whitespace.
199       */
200      private static String parseAttributeName(CharStream cs) throws IOException, JSExn {
201          while(Character.isSpace(cs.peek())) cs.get();
202          String ret = "";
203          while(!Character.isSpace(cs.peek()) && cs.peek() != '=' && cs.peek() != '>') ret += cs.get();
204          return ret.toLowerCase();
205      }
206  
207      /** Parses an attribute value and returns it. The CharStream
208       *  should be positioned at the equals sign, possibly with
209       *  intervening whitespace.
210       */
211      private static String parseAttributeValue(CharStream cs) throws IOException, JSExn {
212  
213          // eat whitespace and equals sign
214          while(Character.isSpace(cs.peek())) cs.get();
215          if (cs.peek() != '=') return "";
216          cs.get();
217          while(Character.isSpace(cs.peek())) cs.get();
218  
219          boolean doublequoted = false;
220          boolean singlequoted = false;
221          String ret = "";
222  
223          if (cs.peek() == '\"') { doublequoted = true; cs.get(); }
224          else if (cs.peek() == '\'') { singlequoted = true; cs.get(); }
225  
226          while(true) {
227              char c = cs.peek();
228              if (!doublequoted && !singlequoted && (Character.isSpace(c) || c == '>')) break;
229              if (singlequoted && c == '\'') { cs.get(); break; }
230              if (doublequoted && c == '\"') { cs.get(); break; }
231              ret += cs.get();
232          }
233          return ret;
234      }
235  
236      /** Parses a comment and returns its body. The CharStream should
237       *  be positioned immediately after the <!--
238       */
239      private static String parseComment(CharStream cs) throws IOException, JSExn {
240          int dashes = 0;
241          String ret = "";
242          while(true) {
243              char c = cs.get();
244              if (c == '>' && dashes == 2) return ret.substring(0, ret.length() - 2);
245              if (c == '-') dashes++;
246              else dashes = 0;
247              ret += c;
248          }
249      }
250  
251      /** Expands all SGML entities in string <tt>s</tt> */
252      public static String expandEntities(String s) throws IOException, JSExn {
253          if (s.indexOf('&') == -1) return s;
254          StringBuffer sb = new StringBuffer();
255          int i=0;
256          int nextamp = 0;
257          while(nextamp != -1) {
258              nextamp = s.indexOf('&', i);
259              sb.append(nextamp == -1 ? s.substring(i) : s.substring(i, nextamp));
260              if (nextamp == -1) break;
261              if (s.regionMatches(nextamp, "&", 0, 5)) {
262                  sb.append("&");
263                  i = nextamp + 5;
264              } else if (s.regionMatches(nextamp, ">", 0, 4)) {
265                  sb.append(">");
266                  i = nextamp + 4;
267              } else if (s.regionMatches(nextamp, "<", 0, 4)) {
268                  sb.append("<");
269                  i = nextamp + 4;
270              } else if (s.regionMatches(nextamp, """, 0, 6)) {
271                  sb.append("\"");
272                  i = nextamp + 6;
273              } else if (s.regionMatches(nextamp, " ", 0, 6)) {
274                  // FEATURE: perhaps we should distinguish this somehow
275                  sb.append(" ");
276                  i = nextamp + 6;
277              } else {
278                  sb.append("&");
279                  i = nextamp + 1;
280              }
281          }
282          return sb.toString();
283      }
284  
285      /** removes all redundant whitespace */
286      private static String removeRedundantWhitespace(String s) throws JSExn {
287  
288          if (s.indexOf(' ') == -1 && s.indexOf('\n') == -1 && s.indexOf('\t') == -1 && s.indexOf('\r') == -1) return s;
289  
290          int len = s.length();
291          if (cbuf == null || cbuf.length < len) {
292              cbuf = new char[len * 2];
293              sbuf = new StringBuffer(len * 2);
294          }
295          sbuf.setLength(0);
296          s.getChars(0, len, cbuf, 0);
297  
298          int last = 0;
299          boolean lastWasWhitespace = false;
300          for(int i=0; i<len; i++) {
301              boolean lastlast = lastWasWhitespace;
302              switch(cbuf[i]) {
303              case '\n': case '\r': case '\t':
304                  cbuf[i] = ' ';
305              case ' ':
306                  lastWasWhitespace = true;
307                  break;
308              default:
309                  lastWasWhitespace = false;
310                  break;
311              }
312              if (lastWasWhitespace && lastlast) {
313                  if (last != i) sbuf.append(cbuf, last, i - last);
314                  last = i+1;
315              }
316          }
317              
318          if (last != len) sbuf.append(cbuf, last, len - last);
319          return sbuf.toString().trim();
320      }
321  
322      // CharStream /////////////////////////////////////////////////////////////////////
323  
324      private static class CharStream extends PushbackReader {
325          public CharStream(Reader r) { super(r, 1024); }
326  
327          public char peek() throws IOException {
328              char c = get();
329              unread(c);
330              return c;
331          }
332  
333          public char get() throws IOException {
334              int i = read();
335              if (i == -1) throw new EOFException();
336              return (char)i;
337          }
338      }
339  
340  }
341  
342