1    // Copyright 2002 Adam Megacz, see the COPYING file for licensing [GPL]
2    package org.xwt;
3    
4    import java.util.*;
5    import java.net.*;
6    import java.io.*;
7    import org.xwt.js.*;
8    import org.xwt.util.*;
9    
10   /* 
 * While entities are limited to a subset of Unicode characters,
 * numeric character references can specify any character. Numeric
 * character references may be given in decimal or hexadecimal, though
 * browser support is stronger for decimal references. Decimal
 * references are of the form &#number; while hexadecimal references
 * take the case-insensitive form &#xnumber;. Examples of numeric
 * character references include &#169; or &#xA9; for the copyright
 * symbol, &#913; or &#x391; for the Greek capital letter alpha, and
 * &#1575; or &#x627; for the Arabic letter ALEF.
20    *
21    * http://www.htmlhelp.com/reference/html40/entities/special.html
22    * http://www.htmlhelp.com/reference/html40/entities/symbols.html
23    * http://www.htmlhelp.com/reference/html40/entities/latin1.html
24    */
25   
26   /**
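 *   This class parses a character stream (a Reader) containing HTML
 *   and returns it as an XWT DOM tree. Each HTML element is returned
 *   as a struct, with the following members:
 *
 *   <ul>
 *   <li> <tt>$name</tt>: the tag name, lowercased (comments use the
 *        name <tt>!--</tt>)
 *   <li> <tt>$numchildren</tt>: the number of children; it is stored
 *        when a child is added, so an element with no children may
 *        not have it (comments always carry 0)
 *   <li> <tt>0</tt>, <tt>1</tt>, ...: the children in document order;
 *        each is either a String of text (entities expanded, runs of
 *        whitespace collapsed) or the struct of a nested element. For
 *        a comment, <tt>0</tt> holds the comment body.
 *   <li> any other key: an attribute, keyed by its lowercased name,
 *        whose value is the entity-expanded attribute text (the empty
 *        string when the attribute has no value)
 *   </ul>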
30    *
31    *   Since HTML may have multiple top level elements (unlike XML),
32    *   this class will search all top level elements for one with a tag
33    *   name 'html'. If such a node is found, only it is returned. If no
34    *   top-level element has the tag name 'html', such a node is
35    *   fabricated, and all top level elements become the children of
36    *   that node, which is then returned.
37    */
38   public class HTML {
39   
40       // FIXME: fill in
41       private final static String[] bodylessTags = new String[] { "br", "hr", "input", "img", "isindex" };
42   
43       /** we keep a char[] around for use by removeRedundantWhitespace() */
44       private static char[] cbuf = null;
45   
46       /** we keep a StringBuffer around for use by removeRedundantWhitespace() */
47       private static StringBuffer sbuf = null;
48   
49       /** true iff we have encountered an LI more recently than the last OL/UL */
50       private static boolean withinLI = false;
51   
52       public static synchronized JS parseReader(Reader r) throws IOException {
53           CharStream cs = new CharStream(r);
54           JS.Obj h = new JS.Obj();
55   
56           withinLI = false;
57           h.put("$name", "html");
58   
59           try {
60               while (true) parseBody(cs, h, null);
61           } catch (EOFException e) {
62               // continue until we get an EOFException
63           }
64           
65           Object[] ids = h.keys();
66           for(int i=0; i<ids.length; i++) {
67               Object el = h.get((String)ids[i]);
68               if (el instanceof JS && "html".equals(((JS)el).get("$name")))
69                   return (JS)el;
70           }
71           
72           return h;
73       }
74   
75       /**
76        *  Parses a single element and stores it in <tt>h</tt>. The
77        *  CharStream should be positioned immediately <i>after</i> the
78        *  open bracket.
79        *
80        *  If a close tag not matching this open tag is found, the
81        *  tagname on the close tag will be returned in order to
82        *  facilitate correcting broken HTML. Otherwise, this returns
83        *  null.
84        */
85       private static String parseElement(CharStream cs, JS h) throws IOException {
86           // scan element name
87           while(Character.isSpace(cs.peek())) cs.get();
88           String elementName = parseElementName(cs);
89   
90           // FIXME: this might not deal correctly with EOFExceptions
91           boolean saveWithinLI = withinLI;
92           if (elementName.equals("li")) {
93               if (withinLI) {
94                   cs.unread(new char[] { '<', 'l', 'i', ' ' });
95                   return "li";
96               } else {
97                   withinLI = true;
98               }
99           } else if (elementName.equals("ol") || elementName.equals("ul")) {
100              withinLI = false;
101          }
102  
103          h.put("$name", elementName);
104          if (elementName.equals("!--")) {
105              h.put("0", parseComment(cs));
106              h.put("$numchildren", new Integer(0));
107              return null;
108          }
109  
110          // scan attributes
111          while (cs.peek() != '>') {
112              String name = parseAttributeName(cs);
113              if (name.equals("")) break;
114              String value = expandEntities(parseAttributeValue(cs));
115              h.put(name, value);
116          } 
117  
118          // eat the close-angle bracket
119          cs.get();
120  
121          // bodyless tags return here
122          for(int i=0; i<bodylessTags.length; i++)
123              if (bodylessTags[i].equals(elementName))
124                  return null;
125  
126          // scan body
127          String ret = parseBody(cs, h, elementName);
128          withinLI = saveWithinLI;
129          return ret;
130      }
131  
132      /**
133       *  Parses the body of an element. The CharStream should be
134       *  positioned at the character immediately after the right
135       *  bracket closing the start-tag
136       */
137      private static String parseBody(CharStream cs, JS h, String elementName) throws IOException {
138          String cdata = "";
139          int length = h.get("$numchildren") == null ? 0 : Integer.parseInt(h.get("$numchildren").toString());
140          while(true) {
141              String closetag = null;
142  
143              try {
144                  char c = cs.get();
145                  if (c != '<') { cdata += c; continue; }
146                  String expanded = removeRedundantWhitespace(expandEntities(cdata));
147                  if (expanded.length() > 0) {
148                      h.put(String.valueOf(length), expanded);
149                      h.put("$numchildren", new Integer(++length));
150                  }
151                  cdata = "";
152  
153              } catch (EOFException e) {
154                  String expanded = removeRedundantWhitespace(expandEntities(cdata));
155                  if (expanded.length() > 0) {
156                      h.put(String.valueOf(length), expanded);
157                      h.put("$numchildren", new Integer(++length));
158                  }
159                  throw e;
160              }
161                  
162              try {
163                  // scan subelement
164                  if (cs.peek() != '/') {
165                      JS kid = new JS.Obj();
166                      closetag = parseElement(cs, kid);
167                      h.put(String.valueOf(length), kid); 
168                      h.put("$numchildren", new Integer(++length));
169                      
170                  // scan close-tag
171                  } else {
172                      cs.get(); // drop the slash
173                      closetag = parseElementName(cs);
174                      while(cs.get() != '>');
175                  }
176              } catch (EOFException e) {
177                  throw e;
178  
179              }
180              
181              if (closetag != null)
182                  return closetag.equals(elementName) ? null : closetag;
183          }
184      }
185  
186      /** Parses an element name and returns it. The CharStream should
187       *  be positioned at the first character of the name.
188       */
189      private static String parseElementName(CharStream cs) throws IOException {
190          String ret = "";
191          while (cs.peek() != '>' && !Character.isSpace(cs.peek())) ret += cs.get();
192          return ret.toLowerCase();
193      }
194  
195      /** Parses an attribute name and returns it. The CharStream should
196       *  be positioned at the first character of the name, possibly
197       *  with intervening whitespace.
198       */
199      private static String parseAttributeName(CharStream cs) throws IOException {
200          while(Character.isSpace(cs.peek())) cs.get();
201          String ret = "";
202          while(!Character.isSpace(cs.peek()) && cs.peek() != '=' && cs.peek() != '>') ret += cs.get();
203          return ret.toLowerCase();
204      }
205  
206      /** Parses an attribute value and returns it. The CharStream
207       *  should be positioned at the equals sign, possibly with
208       *  intervening whitespace.
209       */
210      private static String parseAttributeValue(CharStream cs) throws IOException {
211  
212          // eat whitespace and equals sign
213          while(Character.isSpace(cs.peek())) cs.get();
214          if (cs.peek() != '=') return "";
215          cs.get();
216          while(Character.isSpace(cs.peek())) cs.get();
217  
218          boolean doublequoted = false;
219          boolean singlequoted = false;
220          String ret = "";
221  
222          if (cs.peek() == '\"') { doublequoted = true; cs.get(); }
223          else if (cs.peek() == '\'') { singlequoted = true; cs.get(); }
224  
225          while(true) {
226              char c = cs.peek();
227              if (!doublequoted && !singlequoted && (Character.isSpace(c) || c == '>')) break;
228              if (singlequoted && c == '\'') { cs.get(); break; }
229              if (doublequoted && c == '\"') { cs.get(); break; }
230              ret += cs.get();
231          }
232          return ret;
233      }
234  
235      /** Parses a comment and returns its body. The CharStream should
236       *  be positioned immediately after the <!--
237       */
238      private static String parseComment(CharStream cs) throws IOException {
239          int dashes = 0;
240          String ret = "";
241          while(true) {
242              char c = cs.get();
243              if (c == '>' && dashes == 2) return ret.substring(0, ret.length() - 2);
244              if (c == '-') dashes++;
245              else dashes = 0;
246              ret += c;
247          }
248      }
249  
250      /** Expands all SGML entities in string <tt>s</tt> */
251      public static String expandEntities(String s) throws IOException {
252          if (s.indexOf('&') == -1) return s;
253          StringBuffer sb = new StringBuffer();
254          int i=0;
255          int nextamp = 0;
256          while(nextamp != -1) {
257              nextamp = s.indexOf('&', i);
258              sb.append(nextamp == -1 ? s.substring(i) : s.substring(i, nextamp));
259              if (nextamp == -1) break;
260              if (s.regionMatches(nextamp, "&", 0, 5)) {
261                  sb.append("&");
262                  i = nextamp + 5;
263              } else if (s.regionMatches(nextamp, ">", 0, 4)) {
264                  sb.append(">");
265                  i = nextamp + 4;
266              } else if (s.regionMatches(nextamp, "<", 0, 4)) {
267                  sb.append("<");
268                  i = nextamp + 4;
269              } else if (s.regionMatches(nextamp, """, 0, 6)) {
270                  sb.append("\"");
271                  i = nextamp + 6;
272              } else if (s.regionMatches(nextamp, " ", 0, 6)) {
273                  // FIXME: should have a way to indicate this...
274                  sb.append(" ");
275                  i = nextamp + 6;
276              } else {
277                  sb.append("&");
278                  i = nextamp + 1;
279              }
280          }
281          return sb.toString();
282      }
283  
284      // FIXME double check this
285      /** removes all redundant whitespace */
286      private static String removeRedundantWhitespace(String s) {
287  
288          if (s.indexOf(' ') == -1 && s.indexOf('\n') == -1 && s.indexOf('\t') == -1 && s.indexOf('\r') == -1) return s;
289  
290          int len = s.length();
291          if (cbuf == null || cbuf.length < len) {
292              cbuf = new char[len * 2];
293              sbuf = new StringBuffer(len * 2);
294          }
295          sbuf.setLength(0);
296          s.getChars(0, len, cbuf, 0);
297  
298          int last = 0;
299          boolean lastWasWhitespace = false;
300          for(int i=0; i<len; i++) {
301              boolean lastlast = lastWasWhitespace;
302              switch(cbuf[i]) {
303              case '\n': case '\r': case '\t':
304                  cbuf[i] = ' ';
305              case ' ':
306                  lastWasWhitespace = true;
307                  break;
308              default:
309                  lastWasWhitespace = false;
310                  break;
311              }
312              if (lastWasWhitespace && lastlast) {
313                  if (last != i) sbuf.append(cbuf, last, i - last);
314                  last = i+1;
315              }
316          }
317              
318          if (last != len) sbuf.append(cbuf, last, len - last);
319          return sbuf.toString().trim();
320      }
321  
322      // CharStream /////////////////////////////////////////////////////////////////////
323  
324      private static class CharStream extends PushbackReader {
325          public CharStream(Reader r) { super(r, 1024); }
326  
327          public char peek() throws IOException {
328              char c = get();
329              unread(c);
330              return c;
331          }
332  
333          public char get() throws IOException {
334              int i = read();
335              if (i == -1) throw new EOFException();
336              return (char)i;
337          }
338      }
339  
340  }
341  
342