/*
 * Copyright (C) 1997 - 2001 Loic Dachary
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 *
 */
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif /* HAVE_CONFIG_H */

#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#ifdef HAVE_STDLIB_H
#include <stdlib.h>
#endif /* HAVE_STDLIB_H */
#include <errno.h>
#include <signal.h>
#include <string.h>
#include <fcntl.h>
#include <stdio.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netdb.h>
#if TIME_WITH_SYS_TIME
# include <sys/time.h>
# include <time.h>
#else
# if HAVE_SYS_TIME_H
#  include <sys/time.h>
# else
#  include <time.h>
# endif
#endif
#ifdef HAVE_SOCKS_H
#include <socks.h>
#endif /* HAVE_SOCKS_H */
#ifdef HAVE_DMALLOC_H
#include <dmalloc.h>
#endif /* HAVE_DMALLOC_H */

#include <khash.h>
#include <getopttools.h>
#include <uri.h>
#include <salloc.h>
#include <strshift.h>

#include <webtools.h>
#include <crawlsig.h>

/*
 * Write the local buffer bookkeeping back into *file.
 * Expands to three plain statements and relies on locals named `file`,
 * `buffer`, `buffer_size`, `buffer_length` and `code` being in scope.
 * When `code` is negative the stored length is reset to 0.
 */
#define file_state_save \
  file->buffer = buffer; \
  file->buffer_size = buffer_size; \
  file->buffer_length = code < 0 ? 0 : buffer_length;

/*
 * Declare the locals `buffer`, `buffer_size` and `buffer_length`,
 * initialised from *file. Because it expands to declarations it is used
 * as the first line of a function body, without a trailing semicolon.
 */
#define file_state_restore \
  char* buffer = file->buffer; \
  int buffer_size = file->buffer_size; \
  int buffer_length = file->buffer_length;

/*
 * Set to 1 through the verbose_webtools entry of long_options; when
 * non-zero the functions of this file trace their activity on stderr.
 */
static int verbose = 0;

/* Forward declarations of the file-local helpers defined further down. */
static int webtools_reader_http_header(webtools_params_t* params, int sd, struct webtools_file* file);
static int webtools_reader_http_body(webtools_params_t* params, int sd, struct webtools_file* file);
static int webtools_open_1(webtools_params_t* params, struct in_addr server, short port);
static webtools_params_t* params_alloc();
static struct webtools_file* file(webtools_params_t* params, int sd);

/*
 * getopt_long option table of this module, returned by webtools_options().
 * Each entry is { name, has_arg, flag, val }.
 */
static struct option long_options[] =
{
  /* Takes no argument: getopt stores 1 into `verbose` when it is seen. */
  {"verbose_webtools", 0, &verbose, 1},
  /* The next two take an argument and are interpreted by webtools_alloc(). */
  {"webtools_limit", 1, 0, 0},
  {"http_level", 1, 0, 0},
  /*
   * Terminating entry (null name).
   * NOTE(review): the val field carries WEBTOOLS_OPTIONS, presumably an
   * entry count used by getopttools -- confirm against its header.
   */
  {0, 0, 0, WEBTOOLS_OPTIONS}
};

/*
 * Help text for the entries of long_options, as { option name, description }
 * pairs, returned by webtools_help_options().
 */
static struct option_help long_options_help[] =
{
  /* One entry per option declared in long_options. */
  {"verbose_webtools", "network connection related messages."},
  {"webtools_limit", "maximum total size of bytes read from input."},
  {"http_level", "HTTP level (1.0 or 1.1)"},
  /* NOTE(review): presumably the end-of-table marker expected by getopttools -- confirm. */
  {"0", ""}
};

/*
 * Signal action installed for SIGALRM by webtools_open_1(). As a global
 * object it starts zero-filled; only sa_handler is assigned in this file.
 */
struct sigaction action_timeout_firewall;

/*
 * Give access to the option table of this module.
 * The array argument is unnamed and ignored; the address of the static
 * long_options table is always returned.
 */
struct option* webtools_options(struct option [])
{
  struct option* table = long_options;

  return table;
}

/*
 * Give access to the help strings matching webtools_options().
 * The array argument is unnamed and ignored; the address of the static
 * long_options_help table is always returned.
 */
struct option_help* webtools_help_options(struct option_help [])
{
  struct option_help* table = long_options_help;

  return table;
}

/*
 * Build a webtools_params_t from the command line.
 *
 * argc/argv are scanned with getopt_long_only() against `options`;
 * params->size_limit starts at WEBTOOLS_SIZE_LIMIT. Recognised options:
 *   - http_level <string> : a private copy is stored in params->http_level
 *   - webtools_limit <n>  : converted with atoi() into params->size_limit
 *   - port                : accepted, no further action here
 *   - any option whose table entry has a non-null flag pointer
 * Every recognised option is also recorded in the params->options hash as
 * name -> argument (" " when the option carries no argument). Anything
 * else is skipped silently (opterr is cleared).
 *
 * Returns the newly allocated parameters; release them with webtools_free().
 */
webtools_params_t* webtools_alloc(int argc, char** argv, struct option options[])
{
  webtools_params_t* params = params_alloc();

  params->size_limit = WEBTOOLS_SIZE_LIMIT;

  /* No getopt diagnostics, and restart the scan from the first argument. */
  opterr = 0;
  optind = 0;
  while(1) {
    /* `getopt_long' stores the option index here. */
    int option_index = 0;
    int c;
    int found = 1;

    c = getopt_long_only(argc, argv, "-", options, &option_index);

    /* Detect the end of the options. */
    if (c == -1)
      break;

    switch (c)
      {
      case 0:
        /* If this option set a flag, do nothing else now. */
        if (options[option_index].flag != 0)
          break;
        if(!strcmp(options[option_index].name, "http_level")) {
          /*
           * The option may be repeated on the command line: drop the copy
           * made for the previous occurrence instead of leaking it.
           */
          free((void*)params->http_level);
          params->http_level = strdup(optarg);
          break;
        } else if(!strcmp(options[option_index].name, "webtools_limit")) {
          params->size_limit = atoi(optarg);
          break;
        } else if(!strcmp(options[option_index].name, "port")) {
          break;
        }
        /* Long option that belongs to some other module. */
        found = 0;
        break;
      default:
        /* Non-option argument or unknown option: not ours. */
        found = 0;
        break;
      }
    if(found) {
      hash_alloc_insert(params->options, strdup(options[option_index].name), strdup(optarg ? optarg : " "));
    }
  }

  return params;
}

/*
 * SIGALRM handler installed by webtools_open_1(). The body is empty on
 * purpose: the signal number is ignored and nothing is done.
 * NOTE(review): presumably its only role is to let the alarm interrupt a
 * blocked connect() instead of killing the process -- confirm.
 */
void hand_timeout_firewall(int ) {
  
}

/*
 * Release everything owned by `params`: both hash tables, the copy of the
 * http_level option and the structure itself. A null `params` is accepted
 * and ignored, like free().
 *
 * NOTE(review): freeing http_level assumes it is only ever assigned by
 * webtools_alloc() (a strdup'ed string) -- confirm no caller stores a
 * string literal there.
 */
void webtools_free(webtools_params_t* params)
{
  if(params == 0)
    return;

  _K(hash_free)(params->options);
  _K(hash_free)(params->host2ip);
  /* The cast keeps this valid whether the field is char* or const char*. */
  free((void*)params->http_level);
  free(params);
}

/*
 * Node release function registered with hash_set_allocator() for the hashes
 * of this file. Both the key and the data of a node are heap blocks owned by
 * the table, so they are freed together with the node itself. The context
 * argument is unnamed and unused.
 */
static void hnode_free(hnode_t *node, void *)
{
  void* payload = node->data;
  void* name = node->key;

  free(name);
  free(payload);
  free(node);
}

/*
 * Resolve `host`, connect to it on TCP port `port` (decimal string) and
 * return the connected socket descriptor.
 *
 * Resolved addresses are kept in the params->host2ip hash, keyed by host
 * name, so that each name is looked up only once. The cache is emptied
 * completely once it holds more than 1000 entries.
 *
 * Returns the descriptor from webtools_open_1(), or -1 with errno set to
 *   EINVAL        if host or port is null,
 *   EADDRNOTAVAIL if the name does not resolve to an IPv4 address.
 * Failure to allocate the cache entry is not an error: the connection is
 * attempted anyway, the address is just not remembered.
 */
int webtools_open(webtools_params_t* params, char* host, char* port)
{
  if(host == 0 || port == 0) {
    errno = EINVAL;
    return -1;
  }

  /*
   * Just clear everything if above limit and restart from scratch. LRU
   * would be smarter.
   */
  if(hash_count(params->host2ip) > 1000) {
    _K(hash_free)(params->host2ip);
    params->host2ip = hash_create(HASHCOUNT_T_MAX, 0, 0);
    hash_set_allocator(params->host2ip, 0, hnode_free, 0);
  }

  struct in_addr address;
  hnode_t* node = hash_lookup(params->host2ip, host);
  if(node) {
    /*
     * Found in cache, reuse.
     */
    address = *(struct in_addr*)hnode_get(node);
  } else {
    /*
     * Not found in cache, do a lookup and push in cache.
     */
    struct hostent* hostent = gethostbyname(host);

    /*
     * Only accept an answer that really carries an IPv4 address of the
     * expected size; anything else would be copied as garbage.
     */
    if(hostent == 0 ||
       hostent->h_addrtype != AF_INET ||
       hostent->h_length < (int)sizeof(address) ||
       hostent->h_addr_list[0] == 0) {
      if(verbose) fprintf(stderr, "webtools_open: could not get hostname IP for %s\n", host);
      errno = EADDRNOTAVAIL;
      return -1;
    }

    /* memcpy: the hostent buffer carries no alignment guarantee. */
    memcpy(&address, hostent->h_addr_list[0], sizeof(address));

    /* The table owns both blocks once inserted (see hnode_free). */
    char* key = strdup(host);
    struct in_addr* cached = (struct in_addr*)malloc(sizeof(struct in_addr));
    if(key != 0 && cached != 0) {
      *cached = address;
      hash_alloc_insert(params->host2ip, key, cached);
    } else {
      /* Out of memory: skip caching rather than dereference null. */
      free(key);
      free(cached);
    }
  }

  return webtools_open_1(params, address, atoi(port));
}

/*
 * Send `size` bytes of `buffer` on descriptor `sd` with a single write()
 * call and hand back its result unchanged: the number of bytes actually
 * written (possibly fewer than `size`), or -1 with errno set. The
 * parameters argument is unnamed and unused.
 */
int webtools_write(webtools_params_t* , int sd, char* buffer, int size)
{
  ssize_t written = write(sd, buffer, size);

  return written;
}

/*
 * Read from descriptor `sd` with the reader selected by params->mode:
 *   WEBTOOLS_READER_HTTP_HEADER -> webtools_reader_http_header()
 *   WEBTOOLS_READER_HTTP_BODY   -> webtools_reader_http_body()
 *
 * Returns the code of the selected reader, WEBTOOLS_READER_UNKNOWN when the
 * mode is not one of the above, or -1 with errno set to EBADF when `sd` has
 * no per-descriptor slot (file() already reported the reason on stderr).
 */
int webtools_reader(webtools_params_t* params, int sd)
{
  int code;
  struct webtools_file* f = file(params, sd);

  /* Both readers dereference the slot unconditionally: stop here. */
  if(f == 0) {
    errno = EBADF;
    return -1;
  }

  /* %p: a pointer does not fit in an int on 64 bit platforms. */
  if(verbose) fprintf(stderr, "webtools_reader: now using %p for fd = %d\n", (void*)f, sd);
  switch(params->mode) {
  case WEBTOOLS_READER_HTTP_HEADER:
    code = webtools_reader_http_header(params, sd, f);
    break;
  case WEBTOOLS_READER_HTTP_BODY:
    code = webtools_reader_http_body(params, sd, f);
    break;
  default:
    fprintf(stderr, "webtools_reader: 0x%x reader unknown\n", params->mode);
    code = WEBTOOLS_READER_UNKNOWN;
    break;
  }
  return code;
}

/*
 * Release the read buffer attached to descriptor `sd`, if it has a slot,
 * and close the descriptor. close() is called even when no slot exists.
 */
void webtools_close(webtools_params_t* params, int sd)
{
  struct webtools_file* f = file(params, sd);

  /* %p: a pointer does not fit in an int on 64 bit platforms. */
  if(verbose) fprintf(stderr, "webtools_close: now freeing %p for fd = %d\n", (void*)f, sd);
  if(f) {
    free(f->buffer);
    /*
     * Descriptor numbers are recycled by the kernel: do not leave a
     * dangling pointer behind for the next user of this slot, and make a
     * second close of the same descriptor harmless.
     */
    f->buffer = 0;
    f->buffer_length = 0;
    f->buffer_size = 0;
  }
  close(sd);
}

/*
 * Maximum number of bytes requested from the socket by a single
 * read_timeout() call in the HTTP header and body readers; the readers
 * size their buffer so that this many bytes plus one always fit.
 */
#define BUFFER_SIZE 10240 

/*
 * read() with a time limit.
 *
 * Waits with select() until `fd` is readable or `timeout` seconds have
 * elapsed, then reads at most `buffer_size` bytes into `buffer`.
 *
 * Returns the result of read() (0 at end of file), or -1 with errno set:
 *   ETIMEDOUT  nothing became readable within `timeout` seconds,
 *   EBADF      `fd` is negative or too large to be stored in an fd_set,
 *   otherwise  whatever select() or read() reported (EINTR included).
 */
int read_timeout(int fd, char* buffer, int buffer_size, int timeout)
{
  fd_set readfds;
  struct timeval t;

  /*
   * FD_SET on a descriptor outside [0, FD_SETSIZE) writes outside of the
   * fd_set object: refuse it instead of corrupting the stack.
   */
  if(fd < 0 || fd >= FD_SETSIZE) {
    errno = EBADF;
    return -1;
  }

  FD_ZERO(&readfds);
  FD_SET(fd, &readfds);

  t.tv_sec = timeout;
  t.tv_usec = 0;

  if(verbose) fprintf(stderr, "reading with %d timeout\n", timeout);

  {
    int ret = select(fd + 1, &readfds, 0, 0, &t);
    if(ret == 1) {
      return read(fd, buffer, buffer_size);
    } else if(ret == 0) {
      errno = ETIMEDOUT;
      return -1;
    } else {
      /* select() failed, errno is already set. */
      return -1;
    }
  }
}

/*
 * Read from `sd` until the end of the HTTP header has been seen.
 *
 * Data accumulate in the buffer attached to `file` (restored on entry,
 * saved on exit by the file_state_* macros). The header callback is called
 * once with WEBTOOLS_READER_START before reading and, when the blank line
 * ending the header is found, once with WEBTOOLS_READER_END and the header
 * text. Whatever was read past the header stays in the buffer.
 *
 * Returns
 *   WEBTOOLS_READER_OK                 header found, or input cut because
 *                                      more than params->size_limit bytes
 *                                      were read (no END callback then),
 *   WEBTOOLS_READER_HTTP_HEADER_SHORT  end of file before the header end,
 *   -1                                 read error or timeout.
 */
static int webtools_reader_http_header(webtools_params_t* params, int sd, struct webtools_file* file)
{
  /* Declares buffer, buffer_size and buffer_length from *file. */
  file_state_restore
  char* p;
  int code = WEBTOOLS_READER_OK;
  int done = 0;
  int saw_interruption = 0;

  params->callback_http_header(params->callback_arg, sd, buffer, 0, WEBTOOLS_READER_START);

  while(!done) {
    int bytes_read;
    /* Make sure one more full read plus one extra byte fit after the data. */
    static_alloc(&buffer, &buffer_size, buffer_length + BUFFER_SIZE + 1);
    p = buffer + buffer_length;
    if((bytes_read = read_timeout(sd, p, BUFFER_SIZE, params->timeout)) <= 0) {
      if(bytes_read == 0) {
	/* End of file before any header terminator. */
	code = WEBTOOLS_READER_HTTP_HEADER_SHORT;
	done = 1;
      } else {
	/*
	 * Interrupted by user signal, resume.
	 * Only the first EINTR seen while `interrupted` is set is
	 * forgiven; any other failure ends the read with -1.
	 */
	if(errno == EINTR && !saw_interruption && interrupted) {
	  saw_interruption = 1;
	} else {
	  fprintf(stderr, "reading socket (header) : %d\n", errno);
	  code = -1;
	  done = 1;
	}
      }
    }

    if(!done && bytes_read > 0) {
      int i;
      if(verbose) fprintf(stderr, "webtools_reader_http_header: got %.*s\n", bytes_read, p);
      /*
       * Find header end if present.
       * The whole buffer is scanned from its start, old data included,
       * for either "\n\n" or "\r\n\r\n".
       */
      for(i = 0; i < buffer_length + bytes_read - 1; i++) {
	if((buffer[i] == '\n' && buffer[i + 1] == '\n') ||
	   (i < buffer_length + bytes_read - 3 &&
	    (buffer[i] == '\r' && buffer[i + 1] == '\n' &&
	     buffer[i + 2] == '\r' && buffer[i + 3] == '\n'))) {
	  done = 1;
	  break;
	}
      }

      if(!done) {
	/* No terminator yet: keep the new bytes and read again. */
	p += bytes_read;
	buffer_length += bytes_read;
	/*
	 * Truncating is not considered as an error. The parser must
	 * be able to deal with the existing data.
	 */
	if(buffer_length > params->size_limit) {
	  if(verbose) fprintf(stderr, "webtools_reader_http_header: trunc because input > %d (change limit with -webtools_limit <size in bytes> option)", params->size_limit);
	  done = 1;
	}
      } else {
	int total_length = buffer_length + bytes_read;
	int remain;
	/* Step over the terminator: i becomes the offset of what follows the header. */
	i += (buffer[i] == '\n' ? 2 : 4);
	/* Bytes already read that come after the header. */
	remain = total_length - i;
	/*
	 * - 2 so that the null will override the last \n or \r instead
	 * of the first character of the following text.
	 */
	buffer_length = i - 2;
	buffer[buffer_length] = '\0';
	if(verbose) fprintf(stderr, "webtools_reader_http_header: %.*s\n", buffer_length, buffer);
	params->callback_http_header(params->callback_arg, sd, buffer, buffer_length, WEBTOOLS_READER_END);
	/*
	 * NOTE(review): strshift presumably moves the `remain` trailing
	 * bytes to the start of the buffer -- confirm in strshift.h.
	 */
	strshift(buffer, total_length, total_length - remain);
	buffer_length = remain;
      }
    }
  }

  /* Stores the buffer state in *file; the length is reset to 0 if code < 0. */
  file_state_save
  return code;
}

/*
 * Read the HTTP body from `sd` until end of file, feeding it piece by
 * piece to the body callback.
 *
 * The buffer attached to `file` is restored on entry; bytes it already
 * holds are counted as read and are delivered together with the first
 * chunk. The callback is called once with WEBTOOLS_READER_START, then with
 * WEBTOOLS_READER_CONTINUE after every successful read, and with
 * WEBTOOLS_READER_END when read returns 0.
 *
 * Returns
 *   WEBTOOLS_READER_OK     end of file reached,
 *   WEBTOOLS_READER_TRUNC  more than params->size_limit bytes were read
 *                          (no END callback in that case),
 *   -1                     read error or timeout.
 */
static int webtools_reader_http_body(webtools_params_t* params, int sd, struct webtools_file* file)
{
  /* Declares buffer, buffer_size and buffer_length from *file. */
  file_state_restore
  char* p;
  int code = WEBTOOLS_READER_OK;
  /* Bytes already sitting in the buffer count toward the size limit. */
  int total_read = buffer_length;
  int done = 0;
  int saw_interruption = 0;

  if(verbose) fprintf(stderr, "buffer_length = %d\n", buffer_length);
  /* Sized once: each chunk is handed over before the next read, so data never pile up. */
  static_alloc(&buffer, &buffer_size, buffer_length + BUFFER_SIZE + 1);
  p = buffer + buffer_length;

  params->callback_http_body(params->callback_arg, sd, buffer, 0, WEBTOOLS_READER_START);

  while(!done) {
    int bytes_read;
    if((bytes_read = read_timeout(sd, p, BUFFER_SIZE, params->timeout)) <= 0) {
      if(bytes_read == 0) {
	/* End of file: pass along whatever is still pending in the buffer. */
	params->callback_http_body(params->callback_arg, sd, buffer, buffer_length, WEBTOOLS_READER_END);
	done = 1;
      } else {
	/*
	 * Interrupted by user signal, resume.
	 * Only the first EINTR seen while `interrupted` is set is
	 * forgiven; any other failure ends the read with -1.
	 */
	if(errno == EINTR && !saw_interruption && interrupted) {
	  saw_interruption = 1;
	} else {
	  fprintf(stderr, "reading socket (body) : %d\n", errno);
	  code = -1;
	  done = 1;
	}
      }
    }

    if(!done && bytes_read > 0) {
      buffer_length += bytes_read;
      total_read += bytes_read;
      if(verbose) fprintf(stderr, "buffer_length = %d\n", buffer_length);
      params->callback_http_body(params->callback_arg, sd, buffer, buffer_length, WEBTOOLS_READER_CONTINUE);
      /* The chunk was delivered: the next read starts at the buffer start. */
      buffer_length = 0;
      p = buffer;

      if(total_read > params->size_limit) {
	if(verbose) fprintf(stderr, "webtools_reader_http_body: trunc because input > %d (change limit with -webtools_limit <size in bytes> option)", params->size_limit);
	code = WEBTOOLS_READER_TRUNC;
	done = 1;
      }
    }
  }

  /* Stores the buffer state in *file; the length is reset to 0 if code < 0. */
  file_state_save
  return code;
}

/* Seconds passed to alarm() around connect() in webtools_open_1(). */
#define TIMEOUT_FIREWALL 60

/*
 * Open a TCP connection to address `server`, port `port` (host byte order).
 *
 * connect() is guarded by a TIMEOUT_FIREWALL seconds alarm whose handler
 * does nothing, so that a connection that never answers fails with EINTR
 * instead of blocking forever. The parameters argument is unnamed and
 * unused.
 *
 * Returns the connected descriptor, or -1 when the peer cannot be reached
 * (ECONNREFUSED, ETIMEDOUT, EHOSTDOWN, EHOSTUNREACH, EINTR); errno then
 * still holds the value set by connect(). Any other failure of socket() or
 * connect() is reported on stderr and terminates the process with exit(1).
 */
static int webtools_open_1(webtools_params_t* , struct in_addr server, short port)
{
  struct sockaddr_in sin;
  int sd;

  if ((sd = socket(AF_INET,SOCK_STREAM,IPPROTO_TCP)) == -1) {
    /* fprintf may change errno: keep the value perror must describe. */
    int socket_errno = errno;
    fprintf(stderr, "could not open socket\n");
    errno = socket_errno;
    perror("");
    exit(1);
  }

  action_timeout_firewall.sa_handler=hand_timeout_firewall;
  sigaction(SIGALRM, &action_timeout_firewall, NULL);

  memset(&sin, '\0', sizeof(sin));
  sin.sin_family = AF_INET ;
  memcpy((char*)&sin.sin_addr.s_addr, &server, sizeof(struct in_addr));
  /* Ports above 32767 arrive as negative shorts: convert back explicitly. */
  sin.sin_port = htons((unsigned short)port);
  alarm(TIMEOUT_FIREWALL);
  if(connect(sd,(struct sockaddr *) &sin, sizeof(sin)) == -1) {
    /* close() and alarm() below must not hide the reason of the failure. */
    int connect_errno = errno;

    alarm(0);
    switch(connect_errno) {
    case ECONNREFUSED:
    case ETIMEDOUT:
    case EHOSTDOWN:
    case EHOSTUNREACH:
    case EINTR:
      /* The peer is unreachable: an ordinary failure, left to the caller. */
      close(sd);
      errno = connect_errno;
      return -1;
    }
    fprintf(stderr, "webtools_open_1: could not connect\n");
    errno = connect_errno;
    perror("");
    exit(1);
  }
  alarm(0);

  if(verbose) fprintf(stderr, "connected\n");

  return sd;
}

/*
 * Allocate a zero-filled webtools_params_t together with its two hash
 * tables (command line options, host name to IP cache). Both tables
 * release their nodes through hnode_free().
 */
static webtools_params_t* params_alloc()
{
  webtools_params_t* fresh;

  fresh = (webtools_params_t*)smalloc(sizeof(webtools_params_t));
  memset((char*)fresh, '\0', sizeof(webtools_params_t));

  /* Options given on the command line, name -> argument. */
  fresh->options = hash_create(33, 0, 0);
  hash_set_allocator(fresh->options, 0, hnode_free, 0);

  /* Resolved addresses, host name -> struct in_addr. */
  fresh->host2ip = hash_create(HASHCOUNT_T_MAX, 0, 0);
  hash_set_allocator(fresh->host2ip, 0, hnode_free, 0);

  return fresh;
}

/*
 * Return the per-descriptor state slot of `sd` in params->files.
 *
 * Returns a null pointer, after a message on stderr, when `sd` is negative
 * or not below WEBTOOLS_MAX_FD, i.e. when it would index outside the table.
 */
static struct webtools_file* file(webtools_params_t* params, int sd)
{
  /* A failed open yields -1: never use it as an array index. */
  if(sd < 0) {
    fprintf(stderr, "webtools_file: invalid negative descriptor %d\n", sd);
    return 0;
  }
  if(sd >= WEBTOOLS_MAX_FD) {
    fprintf(stderr, "webtools_file: descriptor too high %d (max %d)\n", sd, WEBTOOLS_MAX_FD);
    return 0;
  }
  return &(params->files[sd]);
}
