]>
git.ipfire.org Git - people/ms/strongswan.git/blob - doc/utils/html2four.c
2 extract headers from HTML files
3 in format suitable for turning into permuted index
12 maximum sizes for input line and for name in <a> tag
19 all return 0 for OK, 1 for errors
21 int do_file( char *, FILE * ) ;
22 int parse_line( char * ) ;
23 int print_line( char *, char *) ;
24 int print_header_problem( char * ) ;
33 int main(int argc
, char* argv
[])
36 int temp
, done
, status
;
46 if( isdigit(p
[1]) && p
[2] == '\0' ) {
47 max_level
= p
[1] - 0 ;
51 else die("unknown option") ;
56 if( (status
= do_file("STDIN", stdin
)) == 0 )
61 printf("ARGC = %d\n", argc ) ;
66 printf("ARGV P %s %s\n", *argv, p) ;
69 fprintf(stderr
, "%s: null filename pointer\n", prog_name
) ;
72 else if( (fp
= fopen(p
,"r")) == NULL
) {
73 fprintf(stderr
, "%s: cannot open file %s\n", prog_name
, p
) ;
77 if( (temp
= do_file(p
, fp
)) != 0 )
87 printf("%s: %d files processed, %d with errors\n", prog_name, done, status) ;
89 return( status
? 1 : 0 ) ;
92 void die( char *message
)
95 fprintf(stderr
, "%s: %s\n", prog_name
, message
) ;
99 int header_flags
[10] ;
102 char buffer
[MAX_LINE
+1] ;
103 char label
[MAX_NAME
+1] ;
105 int do_file( char *file
, FILE *fp
)
107 int i
, status
, x
, y
;
113 for( i
= 0 ; i
< 10 ; i
++ )
114 header_flags
[i
] = 0 ;
115 current_file
= file
;
117 while( base
= fgets(buffer
, MAX_LINE
, fp
) ) {
118 // count < and > characters in line
119 for( x
= y
= 0, p
= base
; *p
; p
++ )
130 // skip line if no < or >
131 if( x
== 0 && y
== 0 )
133 // report error for unequal count
135 if( strncmp( base
, "<!--", 4) && strncmp(base
, "-->", 3) ) {
137 fprintf(stderr
, "%s in file %s: unequal < > counts %d %d\n",
138 prog_name
, file
, x
, y
) ;
139 fprintf(stderr
, "%s: %s\n", prog_name
, base
) ;
145 // parse lines containing tags
147 if( parse_line(base
) )
149 // check that header labelling is sane
150 for( i
= x
= y
= 0 ; i
< 10 ; i
++ ) {
151 // count non-zero entries
152 if( x
= header_flags
[i
] )
154 // should be in 0 or 1 headers at a time
164 int parse_line( char *data
)
170 for( end
= data
; *end
; end
++ )
172 // trim off trailing returns or newlines
173 for( p
= end
- 1, q
= end
; q
> data
; p
--,q
-- ) {
180 break ; // out of switch()
182 break ; // out of for()
187 // find tag delimiters
189 for( q
= p
+ 1 ; *q
; q
++ )
190 if( *q
== '<' || *q
== '>' )
192 // if we find another '<'
193 // restart tag search from it
198 // "<>" is not interesting
201 fprintf(stderr
, "%s: null tag\n", prog_name
) ;
202 fprintf(stderr
, "%s: line\n", prog_name
, data
) ;
207 // ignore delimiters once found
210 // p points to tag contents, null terminated
212 // save contents of <a name= > tags
216 (p
[2] == 'n' || p
[2] == 'N') &&
217 (p
[3] == 'a' || p
[3] == 'A') &&
218 (p
[4] == 'm' || p
[4] == 'M') &&
219 (p
[5] == 'e' || p
[5] == 'E') &&
221 strncpy(label
, p
+ 7, MAX_NAME
) ;
225 if( in_header
&& strlen(p
) == 2 &&
226 (p
[1] == 'r' || p
[1] == 'R') )
232 if( strlen(p
) == 2 && isdigit(p
[1]) ) {
234 fprintf(stderr
, "%s: bad header nesting in %s\n",
235 prog_name
, current_file
) ;
239 printf("%s\t%s\tH%d\t", current_file
, label
, x
) ;
242 // only care about end-of-header
248 if( strlen(p
) == 2 && isdigit(p
[1]) ) {
250 fprintf(stderr
, "%s: bad header nesting in %s\n",
251 prog_name
, current_file
) ;
260 // uninteresting tag, look for next
264 // tag done, point p beyond it
267 else if( in_header
) {
268 if( isprint(*p
) && *p
!= '\n' )
280 int print_line( char *tag
, char *text
)
282 printf("%%s\ts\t%s\t%s\t\n", current_file
, label
, tag
, text
) ;
286 int print_header_problem( char *file
)
290 fprintf(stderr
, "%s: HEADER TAG PROBLEM in file %s\n", prog_name
, file
) ;
291 fprintf(stderr
, "%s: counts", prog_name
) ;
292 for ( i
= 0 ; i
< 10 ; i
++ )
293 fprintf(stderr
, "\t%d", i
) ;
294 fprintf(stderr
,"\n") ;