html2dic.c 2.42 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106
/*
 * DIRB
 *
 * html2dic.c - Genera un diccionario a partir de una pagina HTML
 * Ultima modificacion: 31/03/2005
 *
 * Idea de Warezzman, coded por Darkraver
 *
 */


// (!) TODO: add support for HTML in Unicode

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Characters that can be part of a dictionary word. */
static const char word_chars[] =
  "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ-_";

/* Capacity of the buffer that collects one "&name;" character reference. */
enum { ENTITY_BUF_SIZE = 1024 };

/* Returns non-zero if ch may appear between the '&' and ';' of a reference. */
static int is_entity_char(int ch) {
  return (ch >= '0' && ch <= '9') ||
         (ch >= 'a' && ch <= 'z') ||
         (ch >= 'A' && ch <= 'Z') ||
         ch == '#';
}

/*
 * Maps a complete reference such as "&ntilde;" to the byte to print.
 * Returns -1 when the reference is not one of the supported accented letters.
 *
 * NOTE(review): the original character literals were lost when the file was
 * transcoded; the ISO-8859-1 values below are assumed -- confirm the intended
 * output encoding.
 */
static int entity_to_latin1(const char *entity) {
  static const struct {
    const char *name;
    unsigned char byte;
  } table[] = {
    { "&ntilde;", 0xF1 },
    { "&aacute;", 0xE1 },
    { "&eacute;", 0xE9 },
    { "&iacute;", 0xED },
    { "&oacute;", 0xF3 },
    { "&uacute;", 0xFA },
    { "&Aacute;", 0xC1 },
    { "&Eacute;", 0xC9 },
    { "&Iacute;", 0xCD },
    { "&Oacute;", 0xD3 },
    { "&Uacute;", 0xDA },
  };
  size_t i;

  for (i = 0; i < sizeof table / sizeof table[0]; i++) {
    if (strcmp(entity, table[i].name) == 0) return table[i].byte;
  }
  return -1;
}

/* Prints one character of a word, starting a new output line for a new word. */
static void emit_word_char(int ch, int *in_word) {
  if (!*in_word) putchar('\n');
  *in_word = 1;
  putchar(ch);
}

/* Handles one character of page text: word characters are printed, any other
 * character ends the current word. */
static void handle_text_char(int ch, int *in_word) {
  if (strchr(word_chars, ch)) emit_word_char(ch, in_word);
  else *in_word = 0;
}

/* Treats an '&' that never became a valid "&name;" reference as literal text,
 * so the characters collected after it are not lost. */
static void flush_broken_entity(const char *entity, size_t len, int *in_word) {
  size_t i;

  *in_word = 0;  /* the literal '&' is not a word character */
  for (i = 1; i < len; i++) handle_text_char((unsigned char)entity[i], in_word);
}

/*
 * html2dic <file>
 *
 * Reads an HTML file and writes to stdout the words found outside of tags,
 * each one preceded by a newline. A word is a run of characters from
 * word_chars; the accented-letter references known to entity_to_latin1()
 * count as word characters, and "&nbsp;" ends the current word. Any other
 * well-formed reference is dropped without ending the word.
 *
 * Exits with status -1 on a usage, open or read error, 0 otherwise.
 */
int main(int argc, char **argv) {
  char buffer[ENTITY_BUF_SIZE];
  size_t len = 0;     /* characters currently stored in buffer */
  int in_tag = 0;     /* between '<' and '>' */
  int in_coded = 0;   /* between '&' and ';' */
  int in_word = 0;    /* last text character printed belonged to a word */
  int c;
  FILE *fd;

  if (argc != 2) {
    printf("Uso: ./html2dic <file>\n");
    exit(-1);
  }

  // Open the input file

  fd = fopen(argv[1], "r");
  if (fd == NULL) {
    perror("fopen");
    exit(-1);
  }

  // File reading loop ---------------------------------------------------------

  while ((c = fgetc(fd)) != EOF) {

    // Character reference in progress

    if (in_coded) {
      if (c == ';') {
        int decoded;

        buffer[len++] = ';';
        buffer[len] = '\0';
        decoded = entity_to_latin1(buffer);
        if (decoded >= 0) emit_word_char(decoded, &in_word);
        else if (strcmp(buffer, "&nbsp;") == 0) in_word = 0;
        in_coded = 0;
        len = 0;
        continue;
      }
      /* Keep two bytes free for the closing ';' and the terminator. */
      if (is_entity_char(c) && len < ENTITY_BUF_SIZE - 2) {
        buffer[len++] = (char)c;
        continue;
      }
      /* Not a reference after all (e.g. "AT&T"): replay it as text and let
       * the current character be processed normally below. */
      flush_broken_entity(buffer, len, &in_word);
      in_coded = 0;
      len = 0;
    }

    if (c == '<') { in_tag = 1; in_word = 0; }

    /* An '&' inside a tag is usually part of a URL, not a reference. */
    if (c == '&' && !in_tag) {
      buffer[0] = '&';
      len = 1;
      in_coded = 1;
      continue;
    }

    // Plain text

    if (!in_tag && c != '\0') handle_text_char(c, &in_word);

    // End of HTML tag

    if (c == '>') in_tag = 0;
  }

  /* Input ended in the middle of a reference: keep the collected text. */
  if (in_coded) flush_broken_entity(buffer, len, &in_word);

  // ---------------------------------------------------------------------------

  if (ferror(fd)) {
    perror("fgetc");
    fclose(fd);
    exit(-1);
  }

  fclose(fd);

  return 0;
}