Merge branch 'master' of https://git.maemo.org/projects/erwise
[erwise] / Cl / WWWLibrary / SGML.c
1 /*                      General SGML Parser code                SGML.c
2 **                      ========================
3 */
4 #include <ctype.h>
5 #include <stdio.h>
6 #include "HTUtils.h"
7 #include "HTChunk.h"
8 #include "SGML.h"
9
10 #include "tcp.h"        /* For TOUPPER  ! */
11
12 static void             (*contents_treatment) PARAMS((char c));
13 static HTTag            *current_tag;
14 static attr             *current_attribute;
15 static HTChunk          *string =0;
16 static HTElement        *element_stack;
17
18 /*      Handle Attribute
19 **      ----------------
20 */
21 PUBLIC CONST char * SGML_default = "";
22
23 #ifdef __STDC__
24 PRIVATE void handle_attribute_name(const char * s)
25 #else
26 PRIVATE void handle_attribute_name(s)
27     char *s;
28 #endif
29 {
30     for(    current_attribute = current_tag->attributes;
31             current_attribute->name;
32             current_attribute++) {
33         if (0==strcasecomp(current_attribute->name, s))
34             break;
35     }
36     if (!current_attribute->name) {
37         if (TRACE)
38             fprintf(stderr, "SGML: Unknown attribute %s for tag %s\n",
39                 s, current_tag->name);
40         current_attribute = 0;  /* Invalid */
41         return;
42     }
43     current_attribute->present = YES;
44     if (current_attribute->value) {
45         free(current_attribute->value);
46         current_attribute->value = 0;
47     }
48 }
49
50 /*      Handle attribute value
51 **      ----------------------
52 */
53 #ifdef __STDC__
54 PRIVATE void handle_attribute_value(const char * s)
55 #else
56 PRIVATE void handle_attribute_value(s)
57     char *s;
58 #endif
59 {
60     if (current_attribute) {
61         StrAllocCopy(current_attribute->value, s);
62     } else {
63         if (TRACE) fprintf(stderr, "SGML: Attribute value %s ignored\n", s);
64     }
65     current_attribute = 0;      /* can't have two assignments! */
66 }
67
68 /*      Handle entity
69 **      -------------
70 **
71 ** On entry,
72 **      s       contains the entity name zero terminated
73 ** Bugs:
74 **      If the entity name is unknown, the terminator is treated as
75 **      a printable non-special character in all cases, even if it is '<'
76 */
77 #ifdef __STDC__
78 PRIVATE void handle_entity(const char * s, entity * entities, char term)
79 #else
80 PRIVATE void handle_entity(s,entities, term)
81     char * s;
82     entity * entities;
83     char term;
84 #endif
85 {
86     entity * e;
87     for(e = entities; e->name; e++) {
88         if (0==strcmp(e->name, s)) {
89             char * p;
90             for (p=e->representation; *p; p++) {
91                 (*contents_treatment)(*p);
92             }
93             return;     /* Good */
94         }
95     }
96     /* If entity string not found, display as text */
97     if (TRACE)
98         fprintf(stderr, "SGML: Unknown entity %s\n", s); 
99     (*contents_treatment)('&');
100     {
101         CONST char *p;
102         for (p=s; *p; p++) {
103             (*contents_treatment)(*p);
104         }
105     }
106     (*contents_treatment)(term);
107 }
108
109 /*      End element
110 */
111 #ifdef __STDC__
112 PRIVATE void end_element(HTTag * old_tag)
113 #else
114 PRIVATE void end_element(old_tag)
115     HTTag * old_tag;
116 #endif
117 {
118     if (TRACE) fprintf(stderr, "SGML: End   </%s>\n", old_tag->name);
119     if (!old_tag->end) {
120         if (TRACE) fprintf(stderr,"SGML: Illegal end tag </%s> found.\n",
121                 old_tag->name);
122         return;
123     }
124     while (element_stack)       {/* Loop is error path only */
125         HTElement * N = element_stack;
126         HTTag * t = element_stack->tag;
127         
128         if (old_tag != t) {             /* Mismatch: syntax error */
129             if (element_stack->next) {  /* This is not the last level */
130                 if (TRACE) fprintf(stderr,
131                 "SGML: Found </%s> when expecting </%s>. </%s> assumed.\n",
132                     old_tag->name, t->name, t->name);
133             } else {                    /* last level */
134                 if (TRACE) fprintf(stderr,
135                     "SGML: Found </%s> when expecting </%s>. </%s> Ignored.\n",
136                     old_tag->name, t->name, old_tag->name);
137                 return;                 /* Ignore */
138             }
139         }
140         
141         element_stack = N->next;                /* Remove from stack */
142         free(N);
143         (t->end)(t, element_stack);             /* Assume tag end */
144         if (element_stack)                      /* not end of document */
145             contents_treatment = element_stack->tag->treat;
146         if (old_tag == t) return;  /* Correct sequence */
147         
148         /* Syntax error path only */
149         
150     }
151     fprintf(stderr,
152         "SGML: Extra end tag </%s> found and ignored.\n", old_tag->name);
153 }
154
155
156 /*      Start a element
157 */
158 #ifdef __STDC__
159 PRIVATE void start_element(HTTag * new_tag)
160 #else
161 PRIVATE void start_element(new_tag)
162     HTTag * new_tag;
163 #endif
164 {
165     if (TRACE) fprintf(stderr, "SGML: Start <%s>\n", new_tag->name);
166     (*new_tag->begin)(new_tag, element_stack);
167     if (new_tag->end) {         /* i.e. tag not empty */
168         HTElement * N = (HTElement *)malloc(sizeof(HTElement));
169         if (N == NULL) outofmem(__FILE__, "start_element");
170         N->next = element_stack;
171         N->tag = new_tag;
172         element_stack = N;
173         contents_treatment = new_tag->treat;
174     }
175 }
176
177
178
179 /*      SGML Engine
180 **      -----------
181 **
182 ** On entry,
183 **      dtd->tags               represents the DTD, along with
184 **      dtd->entities
185 **
186 **      default_tag     represents the initial and final actions,
187 **                      and the character processing, for data outside
188 **                      any tags. May not be empty.
189 */
190
191 PRIVATE enum sgml_state { S_text, S_litteral, S_tag, S_tag_gap, 
192                 S_attr, S_attr_gap, S_equals, S_value,
193                   S_quoted, S_end, S_entity, S_junk_tag} state;
194
195 PUBLIC void SGML_begin  ARGS1(SGML_dtd *,dtd)
196 {
197     if (!string) string = HTChunkCreate(128);   /* Grow by this much */
198     
199     state = S_text;
200     start_element(dtd->default_tag);    /* Start document */
201 }
202
203 PUBLIC void SGML_end  ARGS1(SGML_dtd *,dtd)
204 {
205     end_element(dtd->default_tag);      /* End document */
206 }
207
208 PUBLIC void SGML_character ARGS2(SGML_dtd *,dtd, char,c)
209
210 {
211     switch(state) {
212     case S_text:
213         if (c=='&' && !(element_stack &&
214                         element_stack->tag  &&
215                         element_stack->tag->litteral)) {
216             string->size = 0;
217             state = S_entity;
218             
219         } else if (c=='<') {
220             string->size = 0;
221             state = (element_stack &&
222                         element_stack->tag  &&
223                         element_stack->tag->litteral) ?
224                                 S_litteral : S_tag;
225         } else (*contents_treatment)(c);
226         break;
227
228 /*      In litteral mode, waits only for specific end tag!
229 */
230     case S_litteral :
231         HTChunkPutc(string, c);
232         if ( TOUPPER(c) != ((string->size ==1) ? '/'
233                 : element_stack->tag->name[string->size-2])) {
234             int i;
235             
236             /*  If complete match, end litteral */
237             if ((c=='>') && (!element_stack->tag->name[string->size-2])) {
238                 end_element(element_stack->tag);
239                 string->size = 0;
240                 current_attribute = (attr *) 0;
241                 state = S_text;
242                 break;
243             }           /* If Mismatch: recover string. */
244             (*contents_treatment)('<');
245             for (i=0; i<string->size; i++)      /* recover */
246                (*contents_treatment)(string->data[i]);
247             state = S_text;     
248         }
249         
250         break;
251         
252 /*      Handle Entities
253 */
254     case S_entity:
255         if (isalnum(c))
256             HTChunkPutc(string, c);
257         else {
258             HTChunkTerminate(string);
259             handle_entity(string->data, dtd->entities, c);
260             state = S_text;
261         }
262         break;
263         
264 /*              Tag
265 */          
266     case S_tag:                         /* new tag */
267         if (isalnum(c))
268             HTChunkPutc(string, c);
269         else {                          /* End of tag name */
270             attr * a;
271             if (c=='/') {
272                 if (TRACE) if (string->size!=0)
273                     fprintf(stderr,"SGML:  `<%s/' found!\n", string->data);
274                 state = S_end;
275                 break;
276             }
277             HTChunkTerminate(string) ;
278             for(current_tag = dtd->tags; current_tag->name; current_tag++) {
279                 if (0==strcasecomp(current_tag->name, string->data)) {
280                     break;
281                 }
282             }
283             if (!current_tag->name) {
284                 if(TRACE) fprintf(stderr, "Unknown tag %s\n",
285                         string->data);
286                 state = (c=='>') ? S_text : S_junk_tag;
287                 break;
288             }
289             
290             for (a = current_tag->attributes; a->name; a++ ) {
291                 a->present = NO;
292             }
293             string->size = 0;
294             current_attribute = (attr *) 0;
295             
296             if (c=='>') {
297                 if (current_tag->name) start_element(current_tag);
298                 state = S_text;
299             } else {
300                 state = S_tag_gap;
301             }
302         }
303         break;
304
305                 
306     case S_tag_gap:             /* Expecting attribute or > */
307         if (WHITE(c)) break;    /* Gap between attributes */
308         if (c=='>') {           /* End of tag */
309             if (current_tag->name) start_element(current_tag);
310             state = S_text;
311             break;
312         }
313         HTChunkPutc(string, c);
314         state = S_attr;         /* Get attribute */
315         break;
316         
317                                 /* accumulating value */
318     case S_attr:
319         if (WHITE(c) || (c=='>') || (c=='=')) {         /* End of word */
320             HTChunkTerminate(string) ;
321             handle_attribute_name(string->data);
322             string->size = 0;
323             if (c=='>') {               /* End of tag */
324                 if (current_tag->name) start_element(current_tag);
325                 state = S_text;
326                 break;
327             }
328             state = (c=='=' ?  S_equals: S_attr_gap);
329         } else {
330             HTChunkPutc(string, c);
331         }
332         break;
333                 
334     case S_attr_gap:            /* Expecting attribute or = or > */
335         if (WHITE(c)) break;    /* Gap after attribute */
336         if (c=='>') {           /* End of tag */
337             if (current_tag->name) start_element(current_tag);
338             state = S_text;
339             break;
340         } else if (c=='=') {
341             state = S_equals;
342             break;
343         }
344         HTChunkPutc(string, c);
345         state = S_attr;         /* Get next attribute */
346         break;
347         
348     case S_equals:                      /* After attr = */ 
349         if (WHITE(c)) break;    /* Before attribute value */
350         if (c=='>') {           /* End of tag */
351             fprintf(stderr, "SGML: found = but no value\n");
352             if (current_tag->name) start_element(current_tag);
353             state = S_text;
354             break;
355             
356         } else if (c=='"') {
357             state = S_quoted;
358             break;
359         }
360         HTChunkPutc(string, c);
361         state = S_value;
362         break;
363         
364     case S_value:
365         if (WHITE(c) || (c=='>')) {             /* End of word */
366             HTChunkTerminate(string) ;
367             handle_attribute_value(string->data);
368             string->size = 0;
369             if (c=='>') {               /* End of tag */
370                 if (current_tag->name) start_element(current_tag);
371                 state = S_text;
372                 break;
373             }
374             else state = S_tag_gap;
375         } else {
376             HTChunkPutc(string, c);
377         }
378         break;
379                 
380     case S_quoted:                      /* Quoted attribute value */
381         if (c=='"') {           /* End of attribute value */
382             HTChunkTerminate(string) ;
383             handle_attribute_value(string->data);
384             string->size = 0;
385             state = S_tag_gap;
386         } else {
387             HTChunkPutc(string, c);
388         }
389         break;
390         
391     case S_end:                                 /* </ */
392         if (isalnum(c))
393             HTChunkPutc(string, c);
394         else {                          /* End of end tag name */
395             HTChunkTerminate(string) ;
396             if (c!='>') {
397                 if (TRACE) fprintf(stderr,"SGML:  `</%s%c' found!\n",
398                     string->data, c);
399                 state = S_junk_tag;
400                 break;
401             }
402             for(current_tag = dtd->tags; current_tag->name; current_tag++) {
403                 if (0==strcasecomp(current_tag->name, string->data)) {
404                     end_element(current_tag);
405                     break;
406                 }
407             }
408             if (!current_tag->name) {
409                 if(TRACE) fprintf(stderr,
410                     "Unknown end tag </%s>\n", string->data); 
411             }
412             string->size = 0;
413             current_attribute = (attr *) 0;
414             state = S_text;
415         }
416         break;
417
418                 
419     case S_junk_tag:
420         if (c=='>') {
421             state = S_text;
422         }
423         
424     } /* switch on state */
425
426 }