C: Linux Socket Programming, TCP, a simple HTTP client

Articles may have files attached at the end of the post.

Linux provides a high-level socket API that allows programmers to easily connect to any TCP or UDP service.

In this tutorial, we will see how this works by implementing a simple HTTP client which will request a web page given the hostname and the page name, then read the server's answer and output the HTML content of the reply.

To be able to connect to a service built on top of TCP, we first need to create a socket for the TCP protocol, fill in a network address structure representing our destination and the port to connect to and use the latter to connect to the remote server.

From there, we will be able to send and receive data over the network. Once we are done, we will close the connection.

Below is the C code for a simple HTTP client that will get the host and the page to request from the command line arguments, resolve the hostname to an IP address, connect to this IP on port 80, build the HTTP query, send it and then retrieve the page content.

  #include <stdio.h>
  #include <sys/socket.h>
  #include <arpa/inet.h>
  #include <stdlib.h>
  #include <netdb.h>
  #include <string.h>
  #include <unistd.h>
  /*
   * Forward declarations for the helpers defined after main().
   * Parameterless functions are declared with (void) so that they are real
   * prototypes and the compiler rejects calls that pass arguments.
   */
  int create_tcp_socket(void);
  char *get_ip(char *host);
  char *build_get_query(char *host, char *page);
  void usage(void);

  #define HOST "coding.debuntu.org" /* example hostname; the macro is not referenced by the code in this listing */
  #define PAGE "/"                  /* page requested when none is given on the command line */
  #define PORT 80                   /* TCP port the client connects to */
  #define USERAGENT "HTMLGET 1.0"   /* value sent in the User-Agent request header */
  16.  
  17. int main(int argc, char **argv)
  18. {
  19.   struct sockaddr_in *remote;
  20.   int sock;
  21.   int tmpres;
  22.   char *ip;
  23.   char *get;
  24.   char buf[BUFSIZ+1];
  25.   char *host;
  26.   char *page;
  27.  
  28.   if(argc == 1){
  29.     usage();
  30.     exit(2);
  31.   }  
  32.   host = argv[1];
  33.   if(argc > 2){
  34.     page = argv[2];
  35.   }else{
  36.     page = PAGE;
  37.   }
  38.   sock = create_tcp_socket();
  39.   ip = get_ip(host);
  40.   fprintf(stderr, "IP is %s\n", ip);
  41.   remote = (struct sockaddr_in *)malloc(sizeof(struct sockaddr_in *));
  42.   remote->sin_family = AF_INET;
  43.   tmpres = inet_pton(AF_INET, ip, (void *)(&(remote->sin_addr.s_addr)));
  44.   if( tmpres < 0)  
  45.   {
  46.     perror("Can't set remote->sin_addr.s_addr");
  47.     exit(1);
  48.   }else if(tmpres == 0)
  49.   {
  50.     fprintf(stderr, "%s is not a valid IP address\n", ip);
  51.     exit(1);
  52.   }
  53.   remote->sin_port = htons(PORT);
  54.  
  55.   if(connect(sock, (struct sockaddr *)remote, sizeof(struct sockaddr)) < 0){
  56.     perror("Could not connect");
  57.     exit(1);
  58.   }
  59.   get = build_get_query(host, page);
  60.   fprintf(stderr, "Query is:\n<<START>>\n%s<<END>>\n", get);
  61.  
  62.   //Send the query to the server
  63.   int sent = 0;
  64.   while(sent < strlen(get))
  65.   {
  66.     tmpres = send(sock, get+sent, strlen(get)-sent, 0);
  67.     if(tmpres == -1){
  68.       perror("Can't send query");
  69.       exit(1);
  70.     }
  71.     sent += tmpres;
  72.   }
  73.   //now it is time to receive the page
  74.   memset(buf, 0, sizeof(buf));
  75.   int htmlstart = 0;
  76.   char * htmlcontent;
  77.   while((tmpres = recv(sock, buf, BUFSIZ, 0)) > 0){
  78.     if(htmlstart == 0)
  79.     {
  80.       /* Under certain conditions this will not work.
  81.       * If the \r\n\r\n part is splitted into two messages
  82.       * it will fail to detect the beginning of HTML content
  83.       */
  84.       htmlcontent = strstr(buf, "\r\n\r\n");
  85.       if(htmlcontent != NULL){
  86.         htmlstart = 1;
  87.         htmlcontent += 4;
  88.       }
  89.     }else{
  90.       htmlcontent = buf;
  91.     }
  92.     if(htmlstart){
  93.       fprintf(stdout, htmlcontent);
  94.     }
  95.  
  96.     memset(buf, 0, tmpres);
  97.   }
  98.   if(tmpres < 0)
  99.   {
  100.     perror("Error receiving data");
  101.   }
  102.   free(get);
  103.   free(remote);
  104.   free(ip);
  105.   close(sock);
  106.   return 0;
  107. }
  108.  
  /* Print the command-line synopsis of htmlget to stderr. */
  void usage()
  {
    static const char help[] =
      "USAGE: htmlget host [page]\n"
      "\thost: the website hostname. ex: coding.debuntu.org\n"
      "\tpage: the page to retrieve. ex: index.html, default: /\n";

    fputs(help, stderr);
  }
  115.  
  116.  
  /*
   * Open an IPv4 TCP (stream) socket.
   *
   * Returns the new socket descriptor. On failure the error is reported with
   * perror() and the process exits with status 1.
   */
  int create_tcp_socket()
  {
    int fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);

    if (fd >= 0) {
      return fd;
    }
    perror("Can't create TCP socket");
    exit(1);
  }
  126.  
  127.  
  /*
   * Resolve a host name to its first IPv4 address in dotted-decimal text form.
   *
   * host: name (or dotted-decimal address) handed to gethostbyname().
   *
   * Returns a heap-allocated, NUL-terminated string that the caller must
   * free(). On allocation, lookup or conversion failure a diagnostic is
   * printed to stderr and the process exits with status 1.
   */
  char *get_ip(char *host)
  {
    struct hostent *hent;
    char *ip;

    /* INET_ADDRSTRLEN (16) covers "XXX.XXX.XXX.XXX" plus the terminating NUL. */
    ip = calloc(1, INET_ADDRSTRLEN);
    if (ip == NULL) {
      perror("Can't allocate memory for IP");
      exit(1);
    }
    if ((hent = gethostbyname(host)) == NULL) {
      herror("Can't get IP");
      exit(1);
    }
    /*
     * inet_ntop() must be told the full buffer size including the NUL; a size
     * of 15 makes every 15-character address fail with ENOSPC.
     */
    if (inet_ntop(AF_INET, hent->h_addr_list[0], ip, INET_ADDRSTRLEN) == NULL) {
      perror("Can't resolve host");
      exit(1);
    }
    return ip;
  }
  146.  
  147. char *build_get_query(char *host, char *page)
  148. {
  149.   char *query;
  150.   char *getpage = page;
  151.   char *tpl = "GET /%s HTTP/1.0\r\nHost: %s\r\nUser-Agent: %s\r\n\r\n";
  152.   if(getpage[0] == '/'){
  153.     getpage = getpage + 1;
  154.     fprintf(stderr,"Removing leading \"/\", converting %s to %s\n", page, getpage);
  155.   }
  156.   // -5 is to consider the %s %s %s in tpl and the ending \0
  157.   query = (char *)malloc(strlen(host)+strlen(getpage)+strlen(USERAGENT)+strlen(tpl)-5);
  158.   sprintf(query, tpl, getpage, host, USERAGENT);
  159.   return query;
  160. }

To compile it, run:

$ gcc -o htmlget htmlget.c
$ ./htmlget 
USAGE: htmlget host [page]
	host: the website hostname. ex: coding.debuntu.org
	page: the page to retrieve. ex: index.html, default: /

Informative messages and errors are printed to stderr. The content of the page is printed to stdout. Thus, to save the HTML content of a page to a file, you will need to run:

$ ./htmlget coding.debuntu.org category > /tmp/page.html

Which will retrieve http://coding.debuntu.org/category

AttachmentSize
htmlget.c3.55 KB

to get an image file

I was trying to get a JPG file with this code, but when I try to open it, it shows: "Error interpreting JPEG image file (Not a JPEG file: starts with 0x48 0x54)". Can anyone help?

greetings, terrans!

With roming! Merry Christmas!

bug

I get a crash with your code when printing the downloaded html if it contains "%" characters. The fix is to change:

  1.     if(htmlstart){
  2.       fprintf(stdout, htmlcontent);
  3.     }

to

  1.     if(htmlstart){
  2.       fprintf(stdout, "%s", htmlcontent);
  3.     }

-- Geoff

very good program,thanks

very good program,thanks

Change to HTTP/1.1

Hello, I was wondering how this might be changed to use HTTP/1.1 instead of 1.0. I tried changing this in the code however the program seemed to hang after getting the information from the server. If anyone is able to help it would be greatly appreciated.

When you change get http 1.0

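When you change the request from HTTP/1.0 to HTTP/1.1, the server keeps the connection open after sending the reply (HTTP/1.1 connections are persistent by default), so the client keeps waiting in recv() and appears to hang. If you want to use HTTP version 1.1, add a "Connection: close" header to the query in the code.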

  1. char *build_get_query(char *host, char *page)
  2. {
  3.         char *query;
  4.         char *getpage = page;
  5.         char *tpl = "GET /%s HTTP/1.1\r\nHost: %s\r\nUser-Agent: %s\r\nConnection: close\r\n\r\n";
  6.        
  7.         if(getpage[0] == '/'){
  8.                 getpage = getpage + 1;