Kannel: Open Source WAP and SMS gateway  svn-r5335
html.c
Go to the documentation of this file.
1 /* ====================================================================
2  * The Kannel Software License, Version 1.0
3  *
4  * Copyright (c) 2001-2018 Kannel Group
5  * Copyright (c) 1998-2001 WapIT Ltd.
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  *
12  * 1. Redistributions of source code must retain the above copyright
13  * notice, this list of conditions and the following disclaimer.
14  *
15  * 2. Redistributions in binary form must reproduce the above copyright
16  * notice, this list of conditions and the following disclaimer in
17  * the documentation and/or other materials provided with the
18  * distribution.
19  *
20  * 3. The end-user documentation included with the redistribution,
21  * if any, must include the following acknowledgment:
22  * "This product includes software developed by the
23  * Kannel Group (http://www.kannel.org/)."
24  * Alternately, this acknowledgment may appear in the software itself,
25  * if and wherever such third-party acknowledgments normally appear.
26  *
27  * 4. The names "Kannel" and "Kannel Group" must not be used to
28  * endorse or promote products derived from this software without
29  * prior written permission. For written permission, please
30  * contact org@kannel.org.
31  *
32  * 5. Products derived from this software may not be called "Kannel",
33  * nor may "Kannel" appear in their name, without prior written
34  * permission of the Kannel Group.
35  *
36  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
37  * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
38  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
39  * DISCLAIMED. IN NO EVENT SHALL THE KANNEL GROUP OR ITS CONTRIBUTORS
40  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
41  * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
42  * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
43  * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
44  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
45  * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
46  * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
47  * ====================================================================
48  *
49  * This software consists of voluntary contributions made by many
50  * individuals on behalf of the Kannel Group. For more information on
51  * the Kannel Group, please see <http://www.kannel.org/>.
52  *
53  * Portions of this software are based upon software originally written at
54  * WapIT Ltd., Helsinki, Finland for the Kannel project.
55  */
56 
57 /*
58  * html.c - routines for manipulating HTML.
59  *
60  * Lars Wirzenius
61  */
62 
63 
64 #include <ctype.h>
65 #include <stdio.h>
66 #include <string.h>
67 
68 #include "html.h"
69 #include "gwlib/gwlib.h"
70 
71 #define SMS_MAX 161
72 
73 
74 /* Is there a comment beginning at offset `pos'? */
75 static int html_comment_begins(Octstr *html, long pos)
76 {
77  char buf[10];
78 
79  octstr_get_many_chars(buf, html, pos, 4);
80  buf[5] = '\0';
81  return strcmp(buf, "<!--") == 0;
82 }
83 
84 
85 /* Skip a comment in HTML. */
86 static void skip_html_comment(Octstr *html, long *pos)
87 {
88  long i;
89 
90  *pos += 4; /* Skip "<!--" at beginning of comment. */
91  i = octstr_search(html, octstr_imm("-->"), *pos);
92  if (i == -1)
93  *pos = octstr_len(html);
94  else
95  *pos = i;
96 }
97 
98 
99 /* Skip a beginning or ending tag in HTML, including any attributes. */
100 static void skip_html_tag(Octstr *html, long *pos)
101 {
102  long i, len;
103  int c;
104 
105  /* Skip leading '<'. */
106  ++(*pos);
107 
108  /* Skip name of tag and attributes with values. */
109  len = octstr_len(html);
110  while (*pos < len && (c = octstr_get_char(html, *pos)) != '>') {
111  if (c == '"' || c == '\'') {
112  i = octstr_search_char(html, c, *pos + 1);
113  if (i == -1)
114  *pos = len;
115  else
116  *pos = i + 1;
117  } else
118  ++(*pos);
119  }
120 
121  /* Skip trailing '>' if it is there. */
122  if (octstr_get_char(html, *pos) == '>')
123  ++(*pos);
124 }
125 
126 
127 /* Convert an HTML entity into a single character and advance `*html' past
128  the entity. */
129 static void convert_html_entity(Octstr *sms, Octstr *html, long *pos)
130 {
131  static struct {
132  char *entity;
133  int latin1;
134  }
135  tab[] = {
136  { "&amp;", '&' },
137  { "&lt;", '<' },
138  { "&gt;", '>' },
139 
140  /* The following is copied from
141 
142  http://www.hut.fi/~jkorpela/HTML3.2/latin1.html
143 
144  by Jukka Korpela. Hand and script edited to form this
145  table. */
146 
147  { "&nbsp;", ' ' },
148  { "&iexcl;", 161 },
149  { "&cent;", 162 },
150  { "&pound;", 163 },
151  { "&curren;", 164 },
152  { "&yen;", 165 },
153  { "&brvbar;", 166 },
154  { "&sect;", 167 },
155  { "&uml;", 168 },
156  { "&copy;", 169 },
157  { "&ordf;", 170 },
158  { "&laquo;", 171 },
159  { "&not;", 172 },
160  { "&shy;", 173 },
161  { "&reg;", 174 },
162  { "&macr;", 175 },
163  { "&deg;", 176 },
164  { "&plusmn;", 177 },
165  { "&sup2;", 178 },
166  { "&sup3;", 179 },
167  { "&acute;", 180 },
168  { "&micro;", 181 },
169  { "&para;", 182 },
170  { "&middot;", 183 },
171  { "&cedil;", 184 },
172  { "&sup1;", 185 },
173  { "&ordm;", 186 },
174  { "&raquo;", 187 },
175  { "&frac14;", 188 },
176  { "&frac12;", 189 },
177  { "&frac34;", 190 },
178  { "&iquest;", 191 },
179  { "&Agrave;", 192 },
180  { "&Aacute;", 193 },
181  { "&Acirc;", 194 },
182  { "&Atilde;", 195 },
183  { "&Auml;", 196 },
184  { "&Aring;", 197 },
185  { "&AElig;", 198 },
186  { "&Ccedil;", 199 },
187  { "&Egrave;", 200 },
188  { "&Eacute;", 201 },
189  { "&Ecirc;", 202 },
190  { "&Euml;", 203 },
191  { "&Igrave;", 204 },
192  { "&Iacute;", 205 },
193  { "&Icirc;", 206 },
194  { "&Iuml;", 207 },
195  { "&ETH;", 208 },
196  { "&Ntilde;", 209 },
197  { "&Ograve;", 210 },
198  { "&Oacute;", 211 },
199  { "&Ocirc;", 212 },
200  { "&Otilde;", 213 },
201  { "&Ouml;", 214 },
202  { "&times;", 215 },
203  { "&Oslash;", 216 },
204  { "&Ugrave;", 217 },
205  { "&Uacute;", 218 },
206  { "&Ucirc;", 219 },
207  { "&Uuml;", 220 },
208  { "&Yacute;", 221 },
209  { "&THORN;", 222 },
210  { "&szlig;", 223 },
211  { "&agrave;", 224 },
212  { "&aacute;", 225 },
213  { "&acirc;", 226 },
214  { "&atilde;", 227 },
215  { "&auml;", 228 },
216  { "&aring;", 229 },
217  { "&aelig;", 230 },
218  { "&ccedil;", 231 },
219  { "&egrave;", 232 },
220  { "&eacute;", 233 },
221  { "&ecirc;", 234 },
222  { "&euml;", 235 },
223  { "&igrave;", 236 },
224  { "&iacute;", 237 },
225  { "&icirc;", 238 },
226  { "&iuml;", 239 },
227  { "&eth;", 240 },
228  { "&ntilde;", 241 },
229  { "&ograve;", 242 },
230  { "&oacute;", 243 },
231  { "&ocirc;", 244 },
232  { "&otilde;", 245 },
233  { "&ouml;", 246 },
234  { "&divide;", 247 },
235  { "&oslash;", 248 },
236  { "&ugrave;", 249 },
237  { "&uacute;", 250 },
238  { "&ucirc;", 251 },
239  { "&uuml;", 252 },
240  { "&yacute;", 253 },
241  { "&thorn;", 254 },
242  { "&yuml;", 255 },
243  };
244  int num_tab = sizeof(tab) / sizeof(tab[0]);
245  long i, code;
246  size_t len;
247  char buf[1024];
248 
249  if (octstr_get_char(html, *pos + 1) == '#') {
250  if (octstr_get_char(html, *pos + 2) == 'x' || octstr_get_char(html, *pos + 2) == 'X')
251  i = octstr_parse_long(&code, html, *pos + 3, 16); /* hex */
252  else
253  i = octstr_parse_long(&code, html, *pos + 2, 10); /* decimal */
254  if (i > 0) {
255  if (code < 256)
256  octstr_append_char(sms, code);
257  *pos = i + 1;
258  if (octstr_get_char(html, *pos) == ';')
259  ++(*pos);
260  } else {
261  ++(*pos);
262  octstr_append_char(sms, '&');
263  }
264  } else {
265  for (i = 0; i < num_tab; ++i) {
266  len = strlen(tab[i].entity);
267  octstr_get_many_chars(buf, html, *pos, len);
268  buf[len] = '\0';
269  if (strcmp(buf, tab[i].entity) == 0) {
270  *pos += len;
271  octstr_append_char(sms, tab[i].latin1);
272  break;
273  }
274  }
275  if (i == num_tab) {
276  ++(*pos);
277  octstr_append_char(sms, '&');
278  }
279  }
280 }
281 
282 
284 {
285  long i, len;
286  int c;
287  Octstr *sms;
288 
289  sms = octstr_create("");
290  len = octstr_len(html);
291  i = 0;
292  while (i < len) {
293  c = octstr_get_char(html, i);
294  switch (c) {
295  case '<':
296  if (html_comment_begins(html, i))
297  skip_html_comment(html, &i);
298  else
299  skip_html_tag(html, &i);
300  break;
301  case '&':
302  convert_html_entity(sms, html, &i);
303  break;
304  default:
305  octstr_append_char(sms, c);
306  ++i;
307  break;
308  }
309  }
311  octstr_strip_blanks(sms);
312  return sms;
313 }
void octstr_append_char(Octstr *ostr, int ch)
Definition: octstr.c:1517
int code
Definition: smsc_cimd2.c:346
long octstr_search(const Octstr *haystack, const Octstr *needle, long pos)
Definition: octstr.c:1070
static void skip_html_tag(Octstr *html, long *pos)
Definition: html.c:100
void octstr_strip_blanks(Octstr *text)
Definition: octstr.c:1346
long octstr_search_char(const Octstr *ostr, int ch, long pos)
Definition: octstr.c:1012
Octstr * octstr_imm(const char *cstr)
Definition: octstr.c:283
static int html_comment_begins(Octstr *html, long pos)
Definition: html.c:75
int latin1
Definition: charset.c:85
#define octstr_create(cstr)
Definition: octstr.h:125
Octstr * html_to_sms(Octstr *html)
Definition: html.c:283
long octstr_len(const Octstr *ostr)
Definition: octstr.c:342
Definition: octstr.c:118
static void convert_html_entity(Octstr *sms, Octstr *html, long *pos)
Definition: html.c:129
long octstr_parse_long(long *nump, Octstr *ostr, long pos, int base)
Definition: octstr.c:749
void octstr_get_many_chars(char *buf, Octstr *ostr, long pos, long len)
Definition: octstr.c:425
int octstr_get_char(const Octstr *ostr, long pos)
Definition: octstr.c:406
void octstr_shrink_blanks(Octstr *text)
Definition: octstr.c:1433
static void skip_html_comment(Octstr *html, long *pos)
Definition: html.c:86
See file LICENSE for details about the license agreement for using, modifying, copying or deriving work from this software.