You are on page 1of 134

Pkt Sniffer Code (Read pkts only) using AF_PACKET Linux Sockets

In the previous part we made a simple sniffer which created a raw socket and started receiving on it. But it had a few drawbacks:
1. Could sniff only incoming data.
2. Could sniff only TCP or UDP or ICMP or any one protocol packets at a time.
3. Provided IP frames, so Ethernet headers were not available.
In this article we are going to modify the same code to fix the above 3 drawbacks. However we shall not be using libpcap. This will
be done using pure Linux sockets. The difference is very small and is 2 lines :
Instead of :
sock_raw = socket(AF_INET , SOCK_RAW , IPPROTO_TCP);
1
We do :
sock_raw = socket( AF_PACKET , SOCK_RAW , htons(ETH_P_ALL)) ;
//Optional
//setsockopt(sock_raw , SOL_SOCKET , SO_BINDTODEVICE , "eth0" , strlen("eth0")+ 1 );

1
2
3

and we are done.

Now it will:
1. Sniff both incoming and outgoing traffic.
2. Sniff ALL ETHERNET FRAMES, which includes all kinds of IP packets and even more if there are any.
3. Provides the Ethernet headers too, which contain the mac addresses.
The setsockopt line is optional. It's important to provide the correct interface name to setsockopt: eth0 in this case, and in most cases.
So you may want to present the user with a list of the available interfaces and let them choose the one to be sniffed.

AGAM NOTE: Can we use this method with the adapter in promiscuous mode and capture everything on the wire? Libpcap
will do this but can this method do it?

Here is the full source code:


#include<netinet/in.h>
#include<errno.h>
#include<netdb.h>
#include<stdio.h> //For standard things
#include<stdlib.h>
//malloc
#include<string.h>
//strlen
#include<netinet/ip_icmp.h>
#include<netinet/udp.h>
#include<netinet/tcp.h>
#include<netinet/ip.h>
#include<netinet/if_ether.h>
#include<net/ethernet.h>
#include<sys/socket.h>
#include<arpa/inet.h>
#include<sys/ioctl.h>
#include<sys/time.h>
#include<sys/types.h>
#include<unistd.h>
/*
 * Notes on the headers included above:
 *   <netinet/ip_icmp.h>  - Provides declarations for icmp header
 *   <netinet/udp.h>      - Provides declarations for udp header
 *   <netinet/tcp.h>      - Provides declarations for tcp header
 *   <netinet/ip.h>       - Provides declarations for ip header
 *   <netinet/if_ether.h> - For ETH_P_ALL
 *   <net/ethernet.h>     - For ether_header
 */

/* Forward declarations of the packet handlers defined further down. */
void ProcessPacket(unsigned char *buffer, int size);
void print_ip_header(unsigned char *Buffer, int Size);
void print_tcp_packet(unsigned char *Buffer, int Size);
void print_udp_packet(unsigned char *Buffer, int Size);
void print_icmp_packet(unsigned char *Buffer, int Size);
void PrintData(unsigned char *data, int Size);
FILE *logfile;
struct sockaddr_in source,dest;
int tcp=0,udp=0,icmp=0,others=0,igmp=0,total=0,i,j;
/*
 * Entry point of the sniffer.
 *
 * Opens log.txt for writing, creates an AF_PACKET/SOCK_RAW socket for
 * ETH_P_ALL and then hands every received frame to ProcessPacket().
 * The receive loop only ends when recvfrom() fails.
 *
 * Returns 1 on any set-up or receive failure; there is no success exit
 * because the loop has no other way to terminate.
 */
int main()
{
    enum { FRAME_BUFFER_SIZE = 65536 }; //Its Big!

    unsigned char *buffer = malloc(FRAME_BUFFER_SIZE);
    if (buffer == NULL) {
        perror("malloc");
        return 1;
    }

    logfile = fopen("log.txt", "w");
    if (logfile == NULL) {
        /* Every print_* helper writes to logfile, so going on would crash. */
        printf("Unable to create log.txt file.");
        free(buffer);
        return 1;
    }
    printf("Starting...\n");

    int sock_raw = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
    //Optional: restrict the capture to a single interface
    //setsockopt(sock_raw , SOL_SOCKET , SO_BINDTODEVICE , "eth0" , strlen("eth0")+ 1 );
    if (sock_raw < 0) {
        //Print the error with proper message
        perror("Socket Error");
        fclose(logfile);
        free(buffer);
        return 1;
    }

    while (1) {
        struct sockaddr saddr;
        socklen_t saddr_size = sizeof saddr;

        //Receive a packet
        int data_size = recvfrom(sock_raw, buffer, FRAME_BUFFER_SIZE, 0,
                                 &saddr, &saddr_size);
        if (data_size < 0) {
            if (errno == EINTR) {
                continue; /* interrupted by a signal: just retry */
            }
            printf("Recvfrom error , failed to get packets\n");
            break;
        }

        //Now process the packet
        ProcessPacket(buffer, data_size);
    }

    /* Only reached after a receive error: release everything we acquired. */
    close(sock_raw);
    fclose(logfile);
    free(buffer);
    return 1;
}

/*
 * Classify one captured Ethernet frame and update the running counters.
 *
 * buffer: start of the frame (Ethernet header first).
 * size:   number of bytes received for this frame.
 *
 * An ETH_P_ALL socket delivers every EtherType, so the IP protocol field
 * is only examined when the frame is long enough to hold an IPv4 header
 * and its EtherType is ETH_P_IP; anything else is counted under "others".
 * ICMP, TCP and UDP packets are additionally written to the log file.
 */
void ProcessPacket(unsigned char* buffer, int size)
{
    const struct ethhdr *eth = (const struct ethhdr *)buffer;
    //Get the IP Header part of this packet , excluding the ethernet header
    const struct iphdr *iph = (const struct iphdr *)(buffer + sizeof(struct ethhdr));
    const int min_ipv4_frame = (int)(sizeof(struct ethhdr) + sizeof(struct iphdr));

    ++total;

    if (size < min_ipv4_frame || ntohs(eth->h_proto) != ETH_P_IP) {
        //Not IPv4 (ARP, IPv6, ...) or too short to carry an IP header
        ++others;
    } else {
        switch (iph->protocol) //Check the Protocol and do accordingly...
        {
        case IPPROTO_ICMP: //ICMP Protocol (1)
            ++icmp;
            print_icmp_packet(buffer, size);
            break;
        case IPPROTO_IGMP: //IGMP Protocol (2)
            ++igmp;
            break;
        case IPPROTO_TCP: //TCP Protocol (6)
            ++tcp;
            print_tcp_packet(buffer, size);
            break;
        case IPPROTO_UDP: //UDP Protocol (17)
            ++udp;
            print_udp_packet(buffer, size);
            break;
        default: //Some other IP protocol
            ++others;
            break;
        }
    }

    printf("TCP : %d   UDP : %d   ICMP : %d   IGMP : %d   Others : %d   Total : %d\r",
           tcp, udp, icmp, igmp, others, total);
    /* The status line ends in '\r', not '\n', so flush it explicitly. */
    fflush(stdout);
}

/*
 * Write the Ethernet header of a frame to the log file.
 *
 * Buffer: start of the frame; interpreted as a struct ethhdr.
 * Size:   frame length (not used here).
 *
 * h_proto is stored in network byte order, so it is converted with
 * ntohs() before printing (IPv4 then reads 2048, i.e. 0x0800).
 */
void print_ethernet_header(unsigned char* Buffer, int Size)
{
    const struct ethhdr *eth = (const struct ethhdr *)Buffer;
    (void)Size;

    fprintf(logfile , "\n");
    fprintf(logfile , "Ethernet Header\n");
    fprintf(logfile , "   |-Destination Address : %.2X-%.2X-%.2X-%.2X-%.2X-%.2X \n",
            eth->h_dest[0], eth->h_dest[1], eth->h_dest[2],
            eth->h_dest[3], eth->h_dest[4], eth->h_dest[5]);
    fprintf(logfile , "   |-Source Address      : %.2X-%.2X-%.2X-%.2X-%.2X-%.2X \n",
            eth->h_source[0], eth->h_source[1], eth->h_source[2],
            eth->h_source[3], eth->h_source[4], eth->h_source[5]);
    fprintf(logfile , "   |-Protocol            : %u \n",
            (unsigned int)ntohs(eth->h_proto));
}

/*
 * Write the Ethernet header followed by the IPv4 header of a frame to the
 * log file.
 *
 * Buffer: start of the frame; the IP header is read right after the
 *         Ethernet header.
 * Size:   frame length (only forwarded to print_ethernet_header).
 *
 * Side effect: the file-level `source` and `dest` structures are
 * overwritten with the packet's source and destination addresses.
 * The flag/fragment-offset bits of the header are not printed.
 */
void print_ip_header(unsigned char* Buffer, int Size)
{
    print_ethernet_header(Buffer , Size);

    const struct iphdr *iph = (const struct iphdr *)(Buffer + sizeof(struct ethhdr));

    memset(&source, 0, sizeof(source));
    source.sin_addr.s_addr = iph->saddr;
    memset(&dest, 0, sizeof(dest));
    dest.sin_addr.s_addr = iph->daddr;

    fprintf(logfile , "\n");
    fprintf(logfile , "IP Header\n");
    fprintf(logfile , "   |-IP Version        : %u\n", (unsigned int)iph->version);
    fprintf(logfile , "   |-IP Header Length  : %u DWORDS or %u Bytes\n",
            (unsigned int)iph->ihl, ((unsigned int)(iph->ihl))*4);
    fprintf(logfile , "   |-Type Of Service   : %u\n", (unsigned int)iph->tos);
    fprintf(logfile , "   |-IP Total Length   : %u  Bytes(Size of Packet)\n",
            (unsigned int)ntohs(iph->tot_len));
    fprintf(logfile , "   |-Identification    : %u\n", (unsigned int)ntohs(iph->id));
    fprintf(logfile , "   |-TTL      : %u\n", (unsigned int)iph->ttl);
    fprintf(logfile , "   |-Protocol : %u\n", (unsigned int)iph->protocol);
    fprintf(logfile , "   |-Checksum : %u\n", (unsigned int)ntohs(iph->check));
    /* inet_ntoa() reuses one static buffer, hence one call per fprintf. */
    fprintf(logfile , "   |-Source IP        : %s\n", inet_ntoa(source.sin_addr));
    fprintf(logfile , "   |-Destination IP   : %s\n", inet_ntoa(dest.sin_addr));
}

void print_tcp_packet(unsigned char* Buffer, int Size)


{
unsigned short iphdrlen;
struct iphdr *iph = (struct iphdr *)( Buffer
iphdrlen = iph->ihl*4;

+ sizeof(struct ethhdr) );

struct tcphdr *tcph=(struct tcphdr*)(Buffer + iphdrlen + sizeof(struct ethhdr));


int header_size =

sizeof(struct ethhdr) + iphdrlen + tcph->doff*4;

fprintf(logfile , "\n\n**********************TCP Packet*************************\n");


print_ip_header(Buffer,Size);
fprintf(logfile , "\n");
fprintf(logfile , "TCP Header\n");
fprintf(logfile , "
|-Source Port
: %u\n",ntohs(tcph->source));
fprintf(logfile , "
|-Destination Port : %u\n",ntohs(tcph->dest));
fprintf(logfile , "
|-Sequence Number
: %u\n",ntohl(tcph->seq));
fprintf(logfile , "
|-Acknowledge Number : %u\n",ntohl(tcph->ack_seq));
fprintf(logfile , "
|-Header Length
: %d DWORDS or %d BYTES\n" ,
(unsigned int)tcph->doff,(unsigned int)tcph->doff*4);
//fprintf(logfile , "
|-CWR Flag : %d\n",(unsigned int)tcph->cwr);
//fprintf(logfile , "
|-ECN Flag : %d\n",(unsigned int)tcph->ece);
fprintf(logfile , "
|-Urgent Flag
: %d\n",(unsigned int)tcph->urg);
fprintf(logfile , "
|-Acknowledgement Flag : %d\n",(unsigned int)tcph->ack);
fprintf(logfile , "
|-Push Flag
: %d\n",(unsigned int)tcph->psh);
fprintf(logfile , "
|-Reset Flag
: %d\n",(unsigned int)tcph->rst);
fprintf(logfile , "
|-Synchronise Flag
: %d\n",(unsigned int)tcph->syn);
fprintf(logfile , "
|-Finish Flag
: %d\n",(unsigned int)tcph->fin);
fprintf(logfile , "
|-Window
: %d\n",ntohs(tcph->window));
fprintf(logfile , "
|-Checksum
: %d\n",ntohs(tcph->check));
fprintf(logfile , "
|-Urgent Pointer : %d\n",tcph->urg_ptr);
fprintf(logfile , "\n");
fprintf(logfile , "
DATA Dump
");
fprintf(logfile , "\n");
fprintf(logfile , "IP Header\n");
PrintData(Buffer,iphdrlen);
fprintf(logfile , "TCP Header\n");
PrintData(Buffer+iphdrlen,tcph->doff*4);
fprintf(logfile , "Data Payload\n");
PrintData(Buffer + header_size , Size - header_size );
}

fprintf(logfile , "\n###########################################################");

/*
 * Write a full report for one UDP datagram to the log file: the Ethernet
 * and IP headers (via print_ip_header), the decoded UDP header, and hex
 * dumps of the IP header, the UDP header and the payload.
 *
 * Buffer: start of the captured frame (Ethernet header first).
 * Size:   number of captured bytes.
 *
 * Frames too short to contain the headers they announce are reported as
 * truncated instead of being decoded from bytes beyond the capture.
 */
void print_udp_packet(unsigned char *Buffer , int Size)
{
    const int eth_len = (int)sizeof(struct ethhdr);

    fprintf(logfile , "\n\n***********************UDP Packet*************************\n");

    if (Size < eth_len + (int)sizeof(struct iphdr)) {
        fprintf(logfile , "   (truncated frame: %d bytes)\n", Size);
        return;
    }

    const struct iphdr *iph = (const struct iphdr *)(Buffer + eth_len);
    const int iphdrlen = iph->ihl * 4;
    /* Size of the UDP header itself, not of a pointer to it. */
    const int udphdrlen = (int)sizeof(struct udphdr);

    if (Size < eth_len + iphdrlen + udphdrlen) {
        fprintf(logfile , "   (truncated frame: %d bytes)\n", Size);
        return;
    }

    const struct udphdr *udph = (const struct udphdr *)(Buffer + eth_len + iphdrlen);
    const int header_size = eth_len + iphdrlen + udphdrlen;

    print_ip_header(Buffer, Size);

    fprintf(logfile , "\nUDP Header\n");
    fprintf(logfile , "   |-Source Port      : %u\n", (unsigned int)ntohs(udph->source));
    fprintf(logfile , "   |-Destination Port : %u\n", (unsigned int)ntohs(udph->dest));
    fprintf(logfile , "   |-UDP Length       : %u\n", (unsigned int)ntohs(udph->len));
    fprintf(logfile , "   |-UDP Checksum     : %u\n", (unsigned int)ntohs(udph->check));

    fprintf(logfile , "\n");
    /* Each dump starts at its own header, not at the Ethernet header. */
    fprintf(logfile , "IP Header\n");
    PrintData(Buffer + eth_len, iphdrlen);

    fprintf(logfile , "UDP Header\n");
    PrintData(Buffer + eth_len + iphdrlen, udphdrlen);

    fprintf(logfile , "Data Payload\n");
    //Move the pointer ahead and reduce the size of string
    PrintData(Buffer + header_size , Size - header_size);

    fprintf(logfile , "\n###########################################################");
}

/*
 * Write a full report for one ICMP message to the log file: the Ethernet
 * and IP headers (via print_ip_header), the ICMP type/code/checksum, and
 * hex dumps of the IP header, the ICMP header and the payload.
 *
 * Buffer: start of the captured frame (Ethernet header first).
 * Size:   number of captured bytes.
 *
 * Frames too short to contain the headers they announce are reported as
 * truncated instead of being decoded from bytes beyond the capture.
 * The echo ID / sequence fields are not printed.
 */
void print_icmp_packet(unsigned char* Buffer , int Size)
{
    const int eth_len = (int)sizeof(struct ethhdr);

    fprintf(logfile , "\n\n***********************ICMP Packet*************************\n");

    if (Size < eth_len + (int)sizeof(struct iphdr)) {
        fprintf(logfile , "   (truncated frame: %d bytes)\n", Size);
        return;
    }

    const struct iphdr *iph = (const struct iphdr *)(Buffer + eth_len);
    const int iphdrlen = iph->ihl * 4;
    /* Size of the ICMP header itself, not of a pointer to it. */
    const int icmphdrlen = (int)sizeof(struct icmphdr);

    if (Size < eth_len + iphdrlen + icmphdrlen) {
        fprintf(logfile , "   (truncated frame: %d bytes)\n", Size);
        return;
    }

    const struct icmphdr *icmph = (const struct icmphdr *)(Buffer + eth_len + iphdrlen);
    const int header_size = eth_len + iphdrlen + icmphdrlen;

    print_ip_header(Buffer , Size);

    fprintf(logfile , "\n");
    fprintf(logfile , "ICMP Header\n");
    fprintf(logfile , "   |-Type : %u", (unsigned int)(icmph->type));
    if (icmph->type == ICMP_TIME_EXCEEDED) { /* type 11 */
        fprintf(logfile , "  (TTL Expired)");
    } else if (icmph->type == ICMP_ECHOREPLY) {
        fprintf(logfile , "  (ICMP Echo Reply)");
    }
    /* Always end the Type line, whatever the message type. */
    fprintf(logfile , "\n");

    fprintf(logfile , "   |-Code : %u\n", (unsigned int)(icmph->code));
    fprintf(logfile , "   |-Checksum : %u\n", (unsigned int)ntohs(icmph->checksum));
    fprintf(logfile , "\n");

    /* Each dump starts at its own header, not at the Ethernet header. */
    fprintf(logfile , "IP Header\n");
    PrintData(Buffer + eth_len, iphdrlen);

    fprintf(logfile , "ICMP Header\n");
    PrintData(Buffer + eth_len + iphdrlen, icmphdrlen);

    fprintf(logfile , "Data Payload\n");
    //Move the pointer ahead and reduce the size of string
    PrintData(Buffer + header_size , (Size - header_size) );

    fprintf(logfile , "\n###########################################################");
}
/* Print bytes data[first..last) as text: printable ASCII as is, the rest as '.'. */
static void print_ascii_column(const unsigned char *data, int first, int last)
{
    for (int k = first; k < last; k++) {
        /* 32..126 is the printable ASCII range; 127 (DEL) and above are not. */
        if (data[k] >= 32 && data[k] < 127) {
            fputc(data[k], logfile);
        } else {
            fputc('.', logfile); //otherwise print a dot
        }
    }
}

/*
 * Hex-dump Size bytes starting at data into the log file, 16 bytes per
 * row, each row followed by its ASCII rendering. A short last row is
 * padded so the ASCII column stays aligned. Nothing is written when
 * Size is zero or negative.
 */
void PrintData (unsigned char* data , int Size)
{
    enum { BYTES_PER_ROW = 16 };

    for (int i = 0; i < Size; i++) {
        if (i != 0 && i % BYTES_PER_ROW == 0) {
            //one line of hex printing is complete: add its ASCII column
            fprintf(logfile , "         ");
            print_ascii_column(data, i - BYTES_PER_ROW, i);
            fprintf(logfile , "\n");
        }

        if (i % BYTES_PER_ROW == 0) {
            fprintf(logfile , "   "); //row indent
        }
        fprintf(logfile , " %02X", (unsigned int)data[i]);

        if (i == Size - 1) {
            //last byte: pad the missing hex cells, then print the ASCII column
            for (int pad = 0; pad < BYTES_PER_ROW - 1 - i % BYTES_PER_ROW; pad++) {
                fprintf(logfile , "   "); //extra spaces
            }
            fprintf(logfile , "         ");
            print_ascii_column(data, i - i % BYTES_PER_ROW, i + 1);
            fprintf(logfile , "\n");
        }
    }
}

The log file will look something like this:

***********************TCP Packet*************************

Ethernet Header

|-Destination Address : 00-25-5E-1A-3D-F1

|-Source Address

: 00-1C-C0-F8-79-EE

|-Protocol

: 8

IP Header

|-IP Version

: 4

|-IP Header Length

: 5 DWORDS or 20 Bytes

|-Type Of Service

: 0

|-IP Total Length

: 141 Bytes(Size of Packet)

|-Identification

: 13122

|-TTL

: 64

|-Protocol : 6

|-Checksum : 45952

|-Source IP

: 192.168.1.6

|-Destination IP

: 74.125.71.125

TCP Header

|-Source Port

: 33655

|-Destination Port : 5222

|-Sequence Number

: 78458457

|-Acknowledge Number : 2427066746

|-Header Length

: 5 DWORDS or 20 BYTES

|-Urgent Flag

: 0

|-Acknowledgement Flag : 1

|-Push Flag

: 1

|-Reset Flag

: 0

|-Synchronise Flag

: 0

|-Finish Flag

: 0

|-Window

: 62920

|-Checksum

: 21544

|-Urgent Pointer : 0

DATA Dump

IP Header

00 25 5E 1A 3D F1 00 1C C0 F8 79 EE 08 00 45 00

.%^.=.....y...E.

00 8D 33 42

..3B

TCP Header

40 00 40 06 B3 80 C0 A8 01 06 4A 7D 47 7D 83 77

@.@..?....J}G}.w

14 66 04 AD

.f..

Data Payload

17 03 01 00 60 A0 9C 5D 14 A1 25 AB CE 8B 7C EB

....`..]..%...|.

1A A4 43 A6 60 DD E8 6B 6E 43 C1 94 6A D2 25 23

..C.`..knC..j.%#

03 98 59 67 1A 2C 07 D3 7E B2 B8 9F 83 38 4C 69

..Yg.,..~....8Li

D3 3A 8E 0D 9E F0 6B CE 9E 6B F4 E1 BD 9E 50 53

.:....k..k....PS

6D F6 AB 11 05 D6 41 82 F0 03 0C A6 E2 48 2B 71

m.....A......H+q

16 81 FF 5B DF 50 D4 5B AD 90 04 5E 4C 94 E7 9B

...[.P.[...^L...

0B 72 7E 32 88

.r~2.

###########################################################

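In the above log we can see the Ethernet headers being printed. They show the source and destination MAC addresses along with the
packet protocol. The value 8 is the EtherType 0x0800 (IP) shown without conversion from network byte order, as read on a little-endian host; converted with ntohs() it would read 2048 (0x0800).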
Note:
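1. The protocol argument is a single EtherType, not a bit mask, so values cannot be combined with | (ETH_P_IP|ETH_P_ARP evaluates to 0x0806, which is just ETH_P_ARP). If you want to sniff only IP packets, for example, then you can try this:
sock_raw = socket( AF_PACKET , SOCK_RAW , htons(ETH_P_IP)) ;
To capture both IP and ARP packets, either open one socket per EtherType, or keep ETH_P_ALL and filter on the h_proto field of the Ethernet header.
The complete list of protocols is found in /usr/include/linux/if_ether.h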
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16

/*
* These are the defined Ethernet Protocol ID's.
*/
#define ETH_P_LOOP      0x0060  /* Ethernet Loopback packet */
#define ETH_P_PUP       0x0200  /* Xerox PUP packet */
#define ETH_P_PUPAT     0x0201  /* Xerox PUP Addr Trans packet */
#define ETH_P_IP        0x0800  /* Internet Protocol packet */
#define ETH_P_X25       0x0805  /* CCITT X.25 */
#define ETH_P_ARP       0x0806  /* Address Resolution packet */
#define ETH_P_BPQ       0x08FF  /* G8BPQ AX.25 Ethernet Packet [NOT AN OFFICIALLY REGISTERED ID] */
#define ETH_P_IEEEPUP   0x0a00  /* Xerox IEEE802.3 PUP packet */
#define ETH_P_IEEEPUPAT 0x0a01  /* Xerox IEEE802.3 PUP Addr Trans packet */
#define ETH_P_DEC       0x6000  /* DEC Assigned proto */
#define ETH_P_DNA_DL    0x6001  /* DEC DNA Dump/Load */
#define ETH_P_DNA_RC    0x6002  /* DEC DNA Remote Console */
#define ETH_P_DNA_RT    0x6003  /* DEC DNA Routing */

17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
Enjoy!!

#define ETH_P_LAT       0x6004  /* DEC LAT */
#define ETH_P_DIAG      0x6005  /* DEC Diagnostics */
#define ETH_P_CUST      0x6006  /* DEC Customer use */
#define ETH_P_SCA       0x6007  /* DEC Systems Comms Arch */
#define ETH_P_TEB       0x6558  /* Trans Ether Bridging */
#define ETH_P_RARP      0x8035  /* Reverse Addr Res packet */
#define ETH_P_ATALK     0x809B  /* Appletalk DDP */
#define ETH_P_AARP      0x80F3  /* Appletalk AARP */
#define ETH_P_8021Q     0x8100  /* 802.1Q VLAN Extended Header */
#define ETH_P_IPX       0x8137  /* IPX over DIX */
#define ETH_P_IPV6      0x86DD  /* IPv6 over bluebook */
#define ETH_P_PAUSE     0x8808  /* IEEE Pause frames. See 802.3 31B */
#define ETH_P_SLOW      0x8809  /* Slow Protocol. See 802.3ad 43B */
#define ETH_P_WCCP      0x883E  /* Web-cache coordination protoc draft-wilson-wrec-wccp-v2-00.txt */
#define ETH_P_PPP_DISC  0x8863  /* PPPoE discovery messages */
#define ETH_P_PPP_SES   0x8864  /* PPPoE session messages */
#define ETH_P_MPLS_UC   0x8847  /* MPLS Unicast traffic */
#define ETH_P_MPLS_MC   0x8848  /* MPLS Multicast traffic */
#define ETH_P_ATMMPOA   0x884c  /* MultiProtocol Over ATM */
#define ETH_P_LINK_CTL  0x886c  /* HPNA, wlan link local tunnel */
#define ETH_P_ATMFATE   0x8884  /* Frame-based ATM Transport
                                 * over Ethernet
                                 */
#define ETH_P_PAE       0x888E  /* Port Access Entity (IEEE 802.1X) */
#define ETH_P_AOE       0x88A2  /* ATA over Ethernet */
#define ETH_P_TIPC      0x88CA  /* TIPC */
#define ETH_P_1588      0x88F7  /* IEEE 1588 Timesync */
#define ETH_P_FCOE      0x8906  /* Fibre Channel over Ethernet */
#define ETH_P_FIP       0x8914  /* FCoE Initialization Protocol */
#define ETH_P_EDSA      0xDADA  /* Ethertype DSA [ NOT AN OFFICIALLY REGISTERED ID ] */

/*
* Non DIX types. Won't clash for 1500 types.
*/
#define ETH_P_802_3      0x0001  /* Dummy type for 802.3 frames */
#define ETH_P_AX25       0x0002  /* Dummy protocol id for AX.25 */
#define ETH_P_ALL        0x0003  /* Every packet (be careful!!!) */
#define ETH_P_802_2      0x0004  /* 802.2 frames */
#define ETH_P_SNAP       0x0005  /* Internal only */
#define ETH_P_DDCMP      0x0006  /* DEC DDCMP: Internal only */
#define ETH_P_WAN_PPP    0x0007  /* Dummy type for WAN PPP frames*/
#define ETH_P_PPP_MP     0x0008  /* Dummy type for PPP MP frames */
#define ETH_P_LOCALTALK  0x0009  /* Localtalk pseudo type */
#define ETH_P_CAN        0x000C  /* Controller Area Network */
#define ETH_P_PPPTALK    0x0010  /* Dummy type for Atalk over PPP*/
#define ETH_P_TR_802_2   0x0011  /* 802.2 frames */
#define ETH_P_MOBITEX    0x0015  /* Mobitex (kaz@cafe.net) */
#define ETH_P_CONTROL    0x0016  /* Card specific control frames */
#define ETH_P_IRDA       0x0017  /* Linux-IrDA */
#define ETH_P_ECONET     0x0018  /* Acorn Econet */
#define ETH_P_HDLC       0x0019  /* HDLC frames */
#define ETH_P_ARCNET     0x001A  /* 1A for ArcNet :-) */
#define ETH_P_DSA        0x001B  /* Distributed Switch Arch. */
#define ETH_P_TRAILER    0x001C  /* Trailer switch tagging */
#define ETH_P_PHONET     0x00F5  /* Nokia Phonet frames */
#define ETH_P_IEEE802154 0x00F6  /* IEEE802.15.4 frame */
#define ETH_P_CAIF       0x00F7  /* ST-Ericsson CAIF protocol */

Send an Eth frame using an AF_PACKET socket in C


Content

1 Objective

2 Background

3 Scenario

4 Method

4.1 Overview

4.2 Select the required EtherType

4.3 Create the AF_PACKET socket

4.4 Determine the index number of the Ethernet interface to be used

4.5 Construct the destination address

4.6 Send the Ethernet frame

4.7 Send the frame (using sendto)

4.8 Send the frame (using sendmsg)

5 Alternatives

5.1 Using libpcap

5.2 Using a raw socket

6 Further reading

Tested on

Debian (Lenny)
Ubuntu (Lucid, Trusty)

Objective
To send an arbitrary Ethernet frame using an AF_PACKET socket

Background
Ethernet is a link layer protocol. Most networking programs interact with the network stack at the
transport layer or above, so have no need to deal with Ethernet frames directly, but there are some
circumstances where interaction at a lower level may be necessary. These include:
implementation of Ethernet-based protocols that are not built in to the network stack, and
production of malformed or otherwise non-standard frames for testing purposes.

Scenario
Suppose that you wish to send an ARP request for the IP address 192.168.0.83. The request is to be
sent from interface eth0 to the broadcast MAC address.
(ARP is the Address Resolution Protocol. It is used when a host needs to send a datagram to a given
IP address, but does not know which MAC address corresponds to that IP address.)

Method
Overview
The method described here has five steps:
1. Select the required EtherType.
2. Create the AF_PACKET socket.
3. Determine the index number of the Ethernet interface to be used.
4. Construct the destination address.
5. Send the Ethernet frame.
The following header files are used:
Header                  Used by

<errno.h>               errno
<string.h>              memcpy, strerror, strlen
<arpa/inet.h>           in_addr_t, htons
<net/ethernet.h>        ETHER_ADDR_LEN, ETH_P_*
<net/if.h>              struct ifreq
<netinet/if_ether.h>    struct ether_arp
<netpacket/packet.h>    struct sockaddr_ll
<sys/ioctl.h>           SIOCGIFINDEX, ioctl
<sys/socket.h>          struct sockaddr, struct iovec, struct msghdr, AF_PACKET, SOCK_DGRAM, socket, sendto, sendmsg

AF_PACKET sockets are specific to Linux. Programs that make use of them need elevated privileges
in order to run.

Setting SO_BROADCAST does not appear to be necessary when sending broadcast frames using
an AF_PACKET socket. Some programs do so anyway, which is unlikely to be harmful, and could be
considered a worthwhile hedge against any future change in behaviour.

Select the required EtherType


The EtherType of an Ethernet frame specifies the type of payload that it contains. There are several
sources from which EtherTypes can be obtained:
The header file <linux/if_ether.h> provides constants for most commonly-used EtherTypes.
Examples include ETH_P_IP for the Internet Protocol (0x0800), ETH_P_ARP for the Address
Resolution Protocol (0x0806) and ETH_P_8021Q for IEEE 802.1Q VLAN tags (0x8100).
The IEEE maintains the definitive list of registered EtherTypes.
A semi-official list is maintained by IANA.
The wildcard value ETH_P_ALL allows any EtherType to be received without using multiple sockets.
This includes EtherTypes that are handled by the kernel, such as IP and ARP.
If you need an EtherType for experimental or private use then the values 0x88b5 and 0x88b6 have
been reserved for that purpose.

Create the AF_PACKET socket


The socket that will be used to send the Ethernet frame should be created using the socket function.
This takes three arguments:

the domain (AF_PACKET for a packet socket);


the socket type (SOCK_DGRAM if you want the Ethernet header to be constructed for you
or SOCK_RAW if you want to construct it yourself); and
the protocol (equal to the Ethertype chosen above, converted to network byte order), which
is used for filtering inbound packets.
In this instance the socket will be used for sending (and presumably also receiving) ARP requests,
therefore the third argument should be set to htons(ETH_P_ARP) (or equivalently, htons(0x0806)).
There is no need to construct a custom Ethernet header so the second argument should be set
to SOCK_DGRAM:

int fd=socket(AF_PACKET,SOCK_DGRAM,htons(ETH_P_ARP));
if (fd==-1) {
die("%s",strerror(errno));
}

Determine the index number of the Ethernet interface to be used


Network interfaces are usually identified by name in user-facing contexts, but for some low-level
APIs like the one used here a number is used instead. You can obtain the index from the name by
means of the ioctl command SIOCGIFINDEX:
struct ifreq ifr;
size_t if_name_len=strlen(if_name);
if (if_name_len<sizeof(ifr.ifr_name)) {
memcpy(ifr.ifr_name,if_name,if_name_len);
ifr.ifr_name[if_name_len]=0;
} else {
die("interface name is too long");
}
if (ioctl(fd,SIOCGIFINDEX,&ifr)==-1) {
die("%s",strerror(errno));
}
int ifindex=ifr.ifr_ifindex;

For further details of this method see the microHOWTO Get the index number of a Linux network
interface in C using SIOCGIFINDEX.

Construct the destination address


To send a frame using an AF_PACKET socket its destination must be given in the form of
a sockaddr_ll structure. The fields that you need to specify
are sll_family, sll_addr, sll_halen, sll_ifindex and sll_protocol. The remainder should be
zeroed:
const unsigned char ether_broadcast_addr[]=
{0xff,0xff,0xff,0xff,0xff,0xff};
struct sockaddr_ll addr={0};
addr.sll_family=AF_PACKET;
addr.sll_ifindex=ifindex;
addr.sll_halen=ETHER_ADDR_LEN;
addr.sll_protocol=htons(ETH_P_ARP);
memcpy(addr.sll_addr,ether_broadcast_addr,ETHER_ADDR_LEN);

(At the time of writing, the manpage packet(7) stated that


only sll_family, sll_addr, sll_halen and sll_ifindex need be provided when sending. This is
incorrect. The EtherType specified when opening the socket is used for filtering inbound packets
but not for constructing outbound ones.)

Send the Ethernet frame

Frames can in principle be sent using any function that is capable of writing to a file descriptor,
however if you have opted for the link-layer header to be constructed automatically then it will be
necessary to use either sendto or sendmsg so that a destination address can be specified. Of
these sendmsg is the more flexible option, but at the cost of a significantly more complex interface.
Details of each function are given below.
Regardless of which function you choose, each function call will result in a separate datagram
being sent. For this reason you must either compose each datagram payload as a single, contiguous
block of memory, or make use of the scatter/gather capability provided by sendmsg.
In this particular scenario the payload to be sent is an ARP request. For completeness, here is an
example of how such a payload might be constructed:
struct ether_arp req;
req.arp_hrd=htons(ARPHRD_ETHER);
req.arp_pro=htons(ETH_P_IP);
req.arp_hln=ETHER_ADDR_LEN;
req.arp_pln=sizeof(in_addr_t);
req.arp_op=htons(ARPOP_REQUEST);
memset(&req.arp_tha,0,sizeof(req.arp_tha));

You will need to set req.arp_tpa to contain the IP address (in network byte order) for which you
want to find the corresponding MAC address. For example, starting from a string in dotted quad
format:
const char* target_ip_string="192.168.0.83";
struct in_addr target_ip_addr={0};
if (!inet_aton(target_ip_string,&target_ip_addr)) {
die("%s is not a valid IP address",target_ip_string);
}
memcpy(&req.arp_tpa,&target_ip_addr.s_addr,sizeof(req.arp_tpa));

You will also need to set source_ip_addr and source_hw_addr to contain the IP and MAC addresses
of the interface from which the request will be sent (in network byte order). See the
microHOWTOs Get the IP address of a network interface in C using SIOCGIFADDR and Get the
MAC address of an Ethernet interface in C using SIOCGIFHWADDR for details of how to obtain
these given the interface name.

Send the frame (using sendto)


To call sendto you must supply the content of the frame and the remote address to which it should
be sent:
if (sendto(fd,&req,sizeof(req),0,(struct sockaddr*)&addr,sizeof(addr))==-1) {
die("%s",strerror(errno));
}

The fourth argument is for specifying flags which modify the behaviour of sendto, none of which
are needed in this example.
The value returned by sendto is the number of bytes sent, or -1 if there was an
error. AF_PACKET frames are sent atomically, so unlike when writing to a TCP socket there is no need
to wrap the function call in a loop to handle partially-sent data.

Send the frame (using sendmsg)


To call sendmsg, in addition to the datagram content and remote address you must also construct
an iovec array and a msghdr structure:
struct iovec iov[1];
iov[0].iov_base=&req;
iov[0].iov_len=sizeof(req);
struct msghdr message;
message.msg_name=&addr;
message.msg_namelen=sizeof(addr);
message.msg_iov=iov;
message.msg_iovlen=1;
message.msg_control=0;
message.msg_controllen=0;
if (sendmsg(fd,&message,0)==-1) {
die("%s",strerror(errno));
}

The purpose of the iovec array is to provide a scatter/gather capability so that the datagram payload
need not be stored in a contiguous region of memory. In this example the entire payload is stored in
a single buffer, therefore only one array element is needed.
The msghdr structure exists to bring the number of arguments to recvmsg and sendmsg down to a
manageable number. On entry to sendmsg it specifies where the destination address, the datagram
payload and any ancillary data are stored. In this example no ancillary data has been provided.
If you wish to pass any flags into sendmsg then this cannot be done using msg_flags, which is
ignored on entry. Instead you must pass them using the third argument to sendmsg (which is zero in
this example).

Alternatives
Using libpcap
See:

Send an arbitrary Ethernet frame using libpcap

libpcap is a cross-platform library for capturing traffic from network interfaces. It also has the
ability to send, so provides broadly the same functionality as a packet socket (and on Linux, is
implemented using a packet socket).

The main advantage of using libpcap is that it abstracts away differences between the operating
systems that it supports, thereby allowing relatively portable code to be written. This involves some
loss of functionality, and that may make libpcap unsuitable for use in some circumstances, but
otherwise it is recommended in preference to AF_PACKET sockets on the grounds of portability.

Using a raw socket


See:

Send an arbitrary IPv4 datagram using a raw socket in C

Raw sockets differ from packet sockets in that they operate at the network layer as opposed to the
link layer. For this reason they are limited to network protocols for which raw socket support has
been explicitly built into the network stack, but they also have a number of advantages which result
from operating at a higher level of abstraction:
You can write code that will work with any suitable type of network interface.
Routing and link-layer address resolution are handled for you.
The network layer header is constructed for you unless you request otherwise.
The raw socket API has been partially standardised by POSIX, whereas AF_PACKET sockets
are specific to Linux.
For these reasons, use of a raw socket is recommended unless you specifically need the extra
functionality provided by working at the link layer.

Further reading
packet(7) (Linux manpage)

Send an arbitrary IPv4 datagram using a raw socket in C


Content

1 Objective

2 Background

3 Scenario

4 Method

4.1 Overview

4.2 Select the required protocol number

4.3 Create the raw socket

4.4 Optionally, set the IP_HDRINCL socket option

4.5 Send the datagram

4.6 Send the datagram (using sendto)

4.7 Send the datagram (using sendmsg)

5 Variations
5.1 Sending to the IPv4 broadcast address

6 Alternatives
6.1 Sending at the link layer

7 See also

8 Further reading

Tested on
Debian (Lenny)
Ubuntu (Lucid)

Objective
To send an arbitrary IPv4 datagram using a raw socket in C

Background
Most programs that communicate using the Internet Protocol do so through a transport-layer
protocol such as TCP or UDP and have no need to deal directly with Internet Protocol datagrams,
but there are some circumstances where it is necessary to interact with the network stack at a lower
level. These include:
implementation of transport-layer protocols that are not built in to the
network stack, and
production of malformed or otherwise non-standard datagrams for testing
purposes.

Scenario
Suppose that you wish to send an ICMP echo request to a given IPv4 address. (This is what
the ping command does to determine whether there is a reachable host at that address.)
There is no POSIX API call that provides this functionality per se. You therefore intend to assemble
an ICMP message with the required content then send it as the payload of an IP datagram using a
raw socket.

Method
Overview
The method described here has five steps:
1. Select the required protocol number.
2. Create the raw socket.
3. Optionally, set the IP_HDRINCL socket option.

4. Construct the datagram.


5. Send the datagram.
The following header files will be needed:
#include <errno.h>
#include <unistd.h>
#include <netdb.h>
#include <sys/socket.h>
#include <netinet/in.h>

Note that POSIX-compatible operating systems are not obliged to support raw sockets at all, and
the API that has been fully standardised is quite restrictive. For this reason it is often necessary for
programs that use raw sockets to stray into the realm of implementation-defined behaviour. They
are also likely to require elevated privileges in order to run.

Select the required protocol number


All IPv4 traffic is labelled with a protocol number to distinguish between the various transport-layer protocols (such as TCP and UDP) that IPv4 can carry. You will need this number:
when opening the raw socket (unless you choose IPPROTO_RAW for the
protocol number on a system that interprets this as a wildcard), and/or
when constructing the IP datagram header (if you choose to do this
yourself instead of allowing it to be added automatically).

There are several sources from which protocol numbers can be obtained:
Some protocol numbers are defined as constants by the API. POSIX
defines IPPROTO_TCP, IPPROTO_UDP and IPPROTO_ICMP, and glibc defines many
more.
Protocol numbers can be looked up at run time by calling the
function getprotobyname.
IANA maintains a list of assigned protocol numbers.
Unlike a TCP or UDP port number there is little risk of an assigned IP protocol number ever
needing to change, especially for a widely-used protocol such as ICMP. For this reason there is no
real need to look up the protocol number at runtime, and it is quite reasonable for the required value
to be hard-coded.
For this particular example there is a symbolic constant, IPPROTO_ICMP, that all POSIX-compatible
operating systems are supposed to provide. The simplest solution would be to use that. If you
instead want to call getprotobyname then this can be done as follows:
const char* protocol_name="icmp";
struct protoent* protocol=getprotobyname(protocol_name);
if (!protocol) {
die("Protocol %s not found",protocol_name);
}
int protocol_number=protocol->p_proto;

Note that getprotobyname is not thread-safe. In a multi-threaded program it would be advisable to


look up any required protocol numbers at the outset if this is practicable.

Create the raw socket


The socket that will be used to send the IP datagram should be created using the socket function.
This takes three arguments:

1. the domain (AF_INET in this case, meaning IPv4),


2. the socket type (SOCK_RAW in this case, meaning that the socket should
provide direct access to the network layer without any transport-layer
protocol), and
3. the protocol (normally corresponding to the protocol field in the Internet
Protocol header).

An alternative to specifying the protocol number as the third argument is to use the
value IPPROTO_RAW. POSIX does not generally allow this, but some implementations use it as a

wildcard or a dummy value. (In the case of Linux it allows any protocol to be sent (with headers)
but nothing can be received.)
In this instance the socket will be used for sending ICMP messages, therefore the third argument
should be set to IPPROTO_ICMP:
int fd=socket(AF_INET,SOCK_RAW,IPPROTO_ICMP);
if (fd==-1) {
die("%s",strerror(errno));
}

Optionally, set the IP_HDRINCL socket option


POSIX does not specify the format in which a datagram should be written to a raw socket, however
the following behaviour is typical:
By default the header is generated automatically, therefore only the
payload should be written.
If the IP_HDRINCL socket option is set then the header should be constructed
by the caller and both header and payload written to the socket.
The protocol level for IP_HDRINCL is IPPROTO_IP. The parameter is a boolean value that is usually
represented by an int. It should be set to zero to disable header inclusion or non-zero to enable it:
int hdrincl=1;
if (setsockopt(fd,IPPROTO_IP,IP_HDRINCL,&hdrincl,sizeof(hdrincl))==-1) {
die("%s",strerror(errno));
}

Support for IP_HDRINCL is quite common, but the details vary as to:
the byte order that should be used for each of the header fields (which is
not necessarily the same for all fields), and
which fields (if any) are filled in automatically.
Some operating systems set IP_HDRINCL implicitly when IPPROTO_RAW is selected (on the grounds
that it would make little sense not to supply a header in that case) but others require an explicit call
to setsockopt. If you want to enable header inclusion then it is prudent to set it regardless, in order
to accommodate either behaviour.

Send the datagram


Raw datagrams can in principle be sent using any function that is capable of writing to a file
descriptor, however it is often necessary to use either sendto or sendmsg so that a destination
address can be specified. There are two possible reasons for this:

If the header will be constructed automatically then the network stack


needs to know what the destination address field should be set to.
You may want to route the datagram towards an address that differs from
the one specified in the IP header.
Of sendto and sendmsg the latter is the more flexible option, but at the cost of a significantly more
complex interface. Details for each function are given below.
Regardless of which function you choose, each function call will result in a separate datagram
being sent. For this reason you must either compose each datagram payload as a single, contiguous
block of memory, or make use of the scatter/gather capability provided by sendmsg.
In this particular example the payload to be sent is an ICMP echo request, which can be constructed
as follows:
const size_t req_size=8;
struct icmphdr req;
req.type=8;
req.code=0;
req.checksum=0;
req.un.echo.id=htons(rand());
req.un.echo.sequence=htons(1);
req.checksum=ip_checksum(&req,req_size);

This makes use of the icmphdr structure provided by glibc and the ip_checksum function described
in the microHOWTO Calculate an Internet Protocol checksum in C. Note that sizeof(req) cannot
be used to obtain the size of the payload because struct icmphdr is not specific to echo requests, so
the constant req_size has been defined for this purpose.

Send the datagram (using sendto)


To call sendto you must supply the content of the datagram and the remote address to which it
should be sent:
if (sendto(fd,&req,req_size,0,
res->ai_addr,res->ai_addrlen)==-1) {
die("%s",strerror(errno));
}

The fourth argument is for specifying flags which modify the behaviour of sendto, none of which
are needed in this example.
The value returned by sendto is the number of bytes sent, or -1 if there was an error. Raw
datagrams are sent atomically, so unlike when writing to a TCP socket there is no need to wrap the
function call in a loop to handle partially-sent data.

Send the datagram (using sendmsg)


To call sendmsg, in addition to the datagram content and remote address you must also construct
an iovec array and a msghdr structure:
struct iovec iov[1];
iov[0].iov_base=&req;
iov[0].iov_len=req_size;
struct msghdr message;
message.msg_name=res->ai_addr;
message.msg_namelen=res->ai_addrlen;
message.msg_iov=iov;
message.msg_iovlen=1;
message.msg_control=0;
message.msg_controllen=0;
if (sendmsg(fd,&message,0)==-1) {
die("%s",strerror(errno));
}

The purpose of the iovec array is to provide a scatter/gather capability so that the datagram payload
need not be stored in a contiguous region of memory. In this example the entire payload is stored in
a single buffer, therefore only one array element is needed.
The msghdr structure exists to bring the number of arguments to recvmsg and sendmsg down to a
manageable number. On entry to sendmsg it specifies where the destination address, the datagram
payload and any ancillary data are stored. In this example no ancillary data has been provided.
If you wish to pass any flags into sendmsg then this cannot be done using msg_flags, which is
ignored on entry. Instead you must pass them using the third argument to sendmsg (which is zero in
this example).

Variations
Sending to the IPv4 broadcast address
By default, attempts to send a datagram to the broadcast address are rejected with an error
(typically EACCES, however it is not obvious from the POSIX specification which error should
occur). This is a safety measure intended to reduce the risk of making unintended broadcasts. It can
be overridden by setting the SO_BROADCAST socket option:
int broadcast=1;
if (setsockopt(fd,SOL_SOCKET,SO_BROADCAST,
&broadcast,sizeof(broadcast))==-1) {
die("%s",strerror(errno));
}

Alternatives
Sending at the link layer
See:

Send an arbitrary Ethernet frame using libpcap


Send an arbitrary Ethernet frame using an AF_PACKET socket in C

Raw sockets of the type described above operate at the network layer. An alternative would be to
inject packets at the link layer, for example in the form of Ethernet frames. This can be done using
libpcap or (on Linux-based systems) using an AF_PACKET socket.
This approach makes it possible to implement any network-layer protocol, whether or not it is
explicitly supported by the network stack, but also brings a number of disadvantages which result
from operating at a lower level of abstraction:
The sender must construct the network layer header, and depending on
the method of injection, perhaps also the link layer header.
The sender must take responsibility for routing and link-layer address
resolution (although it may be possible to delegate these tasks back to
the operating system rather than implementing them from scratch).
The above cannot normally be done without knowledge of the link layer
protocol, which will typically need to be coded into the sending program
on a case-by-case basis.
For these reasons, use of a raw socket is recommended unless you specifically need the extra
functionality provided by working at the link layer.

See also
Send a UDP datagram in C
Establish a TCP connection in C

Further reading
raw(7) (Linux manpage)
The Open Group, sendto, Base Specifications Issue 6
The Open Group, sendmsg, Base Specifications Issue 6
ithilgore, SOCK_RAW Demystified, May 2008

Send an arbitrary Ethernet frame using libpcap


Content

1 Objective

2 Background

3 Scenario

4 Method

4.1 Overview

4.2 Select the required EtherType

4.3 Construct the Ethernet frame

4.4 Obtain a PCAP descriptor by calling pcap_open_live

4.5 Send the Ethernet frame by calling pcap_inject

4.6 Close the PCAP descriptor by calling pcap_close

5 Example program

6 Alternatives

6.1 Using an AF_PACKET socket

6.2 Using a raw socket

7 Further reading

Tested on
Debian (Lenny, Squeeze)
Ubuntu (Lucid)

Objective
To send an arbitrary Ethernet frame using libpcap

Background
Ethernet is a link layer protocol. Most networking programs interact with the network stack at the
transport layer or above, so have no need to deal with Ethernet frames directly, but there are some
circumstances where interaction at a lower level may be necessary. These include:
implementation of Ethernet-based protocols that are not built in to the
network stack, and
production of malformed or otherwise non-standard frames for testing
purposes.

Scenario
Suppose that you wish to send an ARP request for a given IP address from a given Ethernet
interface. You wish to use libpcap to perform the sending.

(ARP is the Address Resolution Protocol. It is used when a host needs to send a datagram to a given
IP address, but does not know which MAC address corresponds to that IP address. It is described
in RFC 826.)

Method
Overview
The method described here has five steps:
1. Select the required EtherType.
2. Construct the Ethernet frame.
3. Obtain a PCAP descriptor by calling pcap_open_live.
4. Send the Ethernet frame by calling pcap_inject.
5. Close the PCAP descriptor by calling pcap_close.

The following header files are used:


Header        Used by
<stdio.h>     fprintf
<stdlib.h>    exit
<pcap.h>      pcap_open_live, pcap_inject, pcap_close, pcap_perror

Be aware that:
Not all network devices are Ethernet interfaces, or use an Ethernet-compatible frame format, or support packet injection using libpcap.
Although a link-layer header must be supplied, libpcap does not promise
to use it as-is: both the source address and the EtherType are at risk of
being altered.
Programs that send raw packets, using this or any other method, are likely to require elevated
privileges in order to run.

Select the required EtherType


The EtherType of an Ethernet frame specifies the type of payload that it contains. There are several
sources from which EtherTypes can be obtained:

On Linux-based systems the header file <linux/if_ether.h> provides


constants for most commonly-used EtherTypes. Examples
include ETH_P_IP for the Internet Protocol (0x0800), ETH_P_ARP for the Address
Resolution Protocol (0x0806) and ETH_P_8021Q for IEEE 802.1Q VLAN tags
(0x8100).
The IEEE maintains the definitive list of registered EtherTypes.
A semi-official list is maintained by IANA.
If you need an EtherType for experimental or private use then the values
0x88b5 and 0x88b6 have been reserved for that purpose.

Construct the Ethernet frame


Frames sent using libpcap must:
have a link-layer header (there is no option for this to be added
automatically), and
be presented to libpcap as a single, contiguous block of memory (there is
no equivalent of the scatter/gather capability provided by recvmsg and
sendmsg).
See the example program below for how this might be done in the specific case where you want to
send an ARP request. Be aware that:
Most network protocols require that multi-byte values be converted to
network byte order.
Structures may have padding added by the compiler (although ones
provided by system headers ought to be safe).
C and C++ place restrictions on when pointer casts can be safely used to
convert data from one type to another.
You will probably need to know the MAC address of the interface from which the packet will be
sent. On Linux-based systems this can be obtained using the ioctl command SIOCGIFHWADDR. See
the microHOWTO Get the MAC address of an Ethernet interface in C using
SIOCGIFHWADDR for details.
As noted previously, libpcap does not guarantee that the link-layer header that is sent will
be identical to the one that was provided.

Obtain a PCAP descriptor by calling pcap_open_live


To access a network interface via libpcap it is necessary to have an open packet capture descriptor.
This is a pointer of type pcap_t* and can be obtained by calling pcap_open_live:
char pcap_errbuf[PCAP_ERRBUF_SIZE];
pcap_errbuf[0]='\0';
pcap_t* pcap=pcap_open_live(if_name,96,0,0,pcap_errbuf);
if (pcap_errbuf[0]!='\0') {
fprintf(stderr,"%s",pcap_errbuf);
}
if (!pcap) {
exit(1);
}

The first argument to pcap_open_live is the name of the interface from which the Ethernet frame is
to be sent, for example eth0. (Remember that not all interfaces are suitable for sending Ethernet
frames.)
The second, third and fourth arguments are the snapshot length, promiscuous mode flag and
timeout. These control how packets are captured, and for the task in hand it is unimportant what
values are used, but if you want to capture as well as send then you will need to ensure that they
have been set appropriately (especially the snapshot length).
The last argument points to a buffer for returning error messages, which must be at
least PCAP_ERRBUF_SIZE bytes long. As suggested on the pcap_open_live manpage, this has been set
to the empty string before the function call then inspected afterwards in order to detect both
warnings and errors.

Send the Ethernet frame by calling pcap_inject


Given a PCAP descriptor, frames can be sent by calling the function pcap_inject:
if (pcap_inject(pcap,&req,sizeof(req))==-1) {
pcap_perror(pcap,0);
pcap_close(pcap);
exit(1);
}

The value returned by pcap_inject is the number of bytes sent, or -1 if there was an error. In the
latter case a human-readable error message can be obtained using pcap_geterr or (as in this
example) printed using pcap_perror.

Close the PCAP descriptor by calling pcap_close


The PCAP descriptor should be closed once it is no longer needed:
pcap_close(pcap);

Example program

The following example program constructs and sends an ARP request using the method described
above:
send_arp.c

It can be compiled using the command:


gcc -o send_arp send_arp.c -lpcap

When invoked it takes two arguments, the name of the Ethernet interface and the (numeric) IP
address to which the ARP request should be directed:
./send_arp eth0 192.168.0.83

Alternatives
Using an AF_PACKET socket
See:

Send an arbitrary Ethernet frame using an AF_PACKET socket in C

On Linux-based systems an alternative way to send an Ethernet frame is to use


an AF_PACKET socket. This has some advantages over the use of libpcap:
It allows packets to be written directly to a POSIX socket descriptor,
making it possible to use facilities such as scatter/gather and nonblocking output, and providing compatibility with libraries like libevent
that act on file descriptors.
It offers a choice between having the link-layer header supplied by the
sender or constructed by the network stack.
It removes a layer of indirection, and the need for libpcap to be present at
compile time or run time.
The main drawback of AF_PACKET sockets is their lack of portability. They are specific to Linux
(version 2.2 and later), and for this reason they are not recommended where the use of libpcap (or a
raw socket) is a viable alternative.

Using a raw socket


See:

Send an arbitrary IPv4 datagram using a raw socket in C

Raw sockets differ from packet sockets in that they operate at the network layer as opposed to the
link layer. For this reason they are limited to network protocols for which raw socket support has

been explicitly built into the network stack, but they also have a number of advantages which result
from operating at a higher level of abstraction:
You can write code that will work with any suitable type of network
interface.
Routing and link-layer address resolution are handled for you.
The network layer header is constructed for you unless you request
otherwise.
The raw socket API has been partially standardised by POSIX.
For these reasons, use of a raw socket is recommended unless you specifically need the extra
functionality provided by working at the link layer.

Further reading
PCAP(3) (libpcap manpage)

Full Source below for sending via libpcap:


// Purpose: to construct an ARP request and write it to an Ethernet interface
// using libpcap.
//
// See: "Send an arbitrary Ethernet frame using libpcap"
//
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <pcap.h>
#include <arpa/inet.h>
#include <net/if.h>
#include <net/ethernet.h>
#include <netinet/if_ether.h>
#include <sys/ioctl.h>
// Construct an ARP request for the IPv4 address given on the command line
// and inject it, wrapped in an Ethernet frame with a broadcast destination,
// on the named interface using libpcap.
//
// Usage: send_arp <interface> <ipv4-address>
// Returns 0 once the frame has been handed to pcap_inject; exits with
// status 1 on any failure.
int main(int argc,const char* argv[]) {
    // Get interface name and target IP address from command line.
    // Both arguments are mandatory: argv[2] is read below, so argc must
    // be at least 3 (program name + two arguments).
    if (argc<3) {
        fprintf(stderr,"usage: send_arp <interface> <ipv4-address>\n");
        exit(1);
    }
    const char* if_name=argv[1];
    const char* target_ip_string=argv[2];

    // Construct Ethernet header (except for source MAC address).
    // (Destination set to broadcast address, FF:FF:FF:FF:FF:FF.)
    struct ether_header header;
    header.ether_type=htons(ETH_P_ARP);
    memset(header.ether_dhost,0xff,sizeof(header.ether_dhost));

    // Construct ARP request (except for MAC and IP addresses).
    // The target hardware address is zeroed because it is what is being
    // asked for.
    struct ether_arp req;
    req.arp_hrd=htons(ARPHRD_ETHER);
    req.arp_pro=htons(ETH_P_IP);
    req.arp_hln=ETHER_ADDR_LEN;
    req.arp_pln=sizeof(in_addr_t);
    req.arp_op=htons(ARPOP_REQUEST);
    memset(&req.arp_tha,0,sizeof(req.arp_tha));

    // Convert target IP address from string, copy into ARP request.
    struct in_addr target_ip_addr={0};
    if (!inet_aton(target_ip_string,&target_ip_addr)) {
        fprintf(stderr,"%s is not a valid IP address",target_ip_string);
        exit(1);
    }
    memcpy(&req.arp_tpa,&target_ip_addr.s_addr,sizeof(req.arp_tpa));

    // Write the interface name to an ifreq structure,
    // for obtaining the source MAC and IP addresses.
    // ifr_name is a fixed-length buffer, so reject names that would not
    // fit together with their terminating null.
    struct ifreq ifr;
    size_t if_name_len=strlen(if_name);
    if (if_name_len<sizeof(ifr.ifr_name)) {
        memcpy(ifr.ifr_name,if_name,if_name_len);
        ifr.ifr_name[if_name_len]=0;
    } else {
        fprintf(stderr,"interface name is too long");
        exit(1);
    }

    // Open an IPv4-family socket for use when calling ioctl.
    int fd=socket(AF_INET,SOCK_DGRAM,0);
    if (fd==-1) {
        perror(NULL);
        exit(1);
    }

    // Obtain the source IP address, copy into ARP request.
    if (ioctl(fd,SIOCGIFADDR,&ifr)==-1) {
        perror(NULL);
        close(fd);
        exit(1);
    }
    struct sockaddr_in* source_ip_addr=(struct sockaddr_in*)&ifr.ifr_addr;
    memcpy(&req.arp_spa,&source_ip_addr->sin_addr.s_addr,sizeof(req.arp_spa));

    // Obtain the source MAC address, copy into Ethernet header and ARP request.
    if (ioctl(fd,SIOCGIFHWADDR,&ifr)==-1) {
        perror(NULL);
        close(fd);
        exit(1);
    }
    // The hardware address is only a MAC address on an Ethernet interface.
    if (ifr.ifr_hwaddr.sa_family!=ARPHRD_ETHER) {
        fprintf(stderr,"not an Ethernet interface");
        close(fd);
        exit(1);
    }
    const unsigned char* source_mac_addr=(unsigned char*)ifr.ifr_hwaddr.sa_data;
    memcpy(header.ether_shost,source_mac_addr,sizeof(header.ether_shost));
    memcpy(&req.arp_sha,source_mac_addr,sizeof(req.arp_sha));
    close(fd);

    // Combine the Ethernet header and ARP request into a contiguous block,
    // since pcap_inject takes the frame as a single buffer.
    unsigned char frame[sizeof(struct ether_header)+sizeof(struct ether_arp)];
    memcpy(frame,&header,sizeof(struct ether_header));
    memcpy(frame+sizeof(struct ether_header),&req,sizeof(struct ether_arp));

    // Open a PCAP packet capture descriptor for the specified interface.
    // The error buffer is cleared first so that a non-empty buffer after
    // the call reveals a warning even when the open itself succeeded.
    char pcap_errbuf[PCAP_ERRBUF_SIZE];
    pcap_errbuf[0]='\0';
    pcap_t* pcap=pcap_open_live(if_name,96,0,0,pcap_errbuf);
    if (pcap_errbuf[0]!='\0') {
        fprintf(stderr,"%s\n",pcap_errbuf);
    }
    if (!pcap) {
        exit(1);
    }

    // Write the Ethernet frame to the interface.
    if (pcap_inject(pcap,frame,sizeof(frame))==-1) {
        // pcap_perror formats its prefix with %s, so it must not be null.
        pcap_perror(pcap,"pcap_inject");
        pcap_close(pcap);
        exit(1);
    }

    // Close the PCAP descriptor.
    pcap_close(pcap);
    return 0;
}

Get the MAC address of an Ethernet interface in C


using SIOCGIFHWADDR
Content

1 Objective

2 Scenario

3 Method

3.1 Overview

3.2 Create an ifreq structure for passing data in and out of ioctl

3.3 Provide an open socket descriptor

3.4 Invoke ioctl

3.5 Check the type of the returned hardware address

3.6 Extract the hardware address from the ifreq structure

4 See also

5 Further reading

Tested on
Debian (Lenny, Squeeze)
Ubuntu (Lucid, Precise)

Objective
To get the MAC address of an Ethernet interface in C using the ioctl command SIOCGIFHWADDR

Scenario
Suppose you wish to display the MAC address of an Ethernet interface. The variable if_name points
to a null-terminated string containing the name of the interface (for example, eth0).

Method
Overview
On Linux-based systems the MAC address of an interface can be obtained using
the ioctl command SIOCGIFHWADDR. The method described here has five steps:
1. Create an ifreq structure for passing data in and out of ioctl.
2. Provide an open socket descriptor.
3. Invoke ioctl.
4. Check the type of the returned hardware address.
5. Extract the hardware address from the ifreq structure.
The following header files will be needed:
#include <errno.h>
#include <string.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <net/if.h>
#include <net/if_arp.h>

Create an ifreq structure for passing data in and out of ioctl


The ifreq structure should initially contain the name of the interface to be queried, which should be
copied into the ifr_name field. Since this is a fixed-length buffer you should take care to ensure that
the name does not cause an overrun:
struct ifreq ifr;
size_t if_name_len=strlen(if_name);
if (if_name_len<sizeof(ifr.ifr_name)) {
memcpy(ifr.ifr_name,if_name,if_name_len);
ifr.ifr_name[if_name_len]=0;
} else {
die("interface name is too long");
}

Provide an open socket descriptor


The socket descriptor is merely an artefact of the way in which ioctl commands are invoked
generally, and is not used for any particular purpose by SIOCGIFHWADDR. It must be open and must
refer to a socket (as opposed to, for example, a regular file). Any type of socket would suffice, but it
should preferably not be one that requires any obscure kernel modules to be loaded. For this
example a UNIX domain socket will be used:
int fd=socket(AF_UNIX,SOCK_DGRAM,0);
if (fd==-1) {
die("%s",strerror(errno));
}

Invoke ioctl
Once you have the ifreq structure and socket descriptor then you are ready to invoke ioctl:
if (ioctl(fd,SIOCGIFHWADDR,&ifr)==-1) {
int temp_errno=errno;
close(fd);
die("%s",strerror(temp_errno));
}
close(fd);

If this completes without error then the hardware address of the interface should have been returned
in ifr.ifr_hwaddr in the form of a struct sockaddr.

Check the type of the returned hardware address


The length and format of the hardware address will depend on the type of network interface it
belongs to, so you should not assume that it is an Ethernet MAC address. You can check the
address type by inspecting the sa_family field of the sockaddr. For an Ethernet interface this should
be equal to ARPHRD_ETHER:
if (ifr.ifr_hwaddr.sa_family!=ARPHRD_ETHER) {
die("not an Ethernet interface");
}

Other possible values of sa_family for different types of network interface can be found in the
header file <net/if_arp.h>, each beginning with the prefix ARPHRD_. Note that for some of these
(such as ARPHRD_LOOPBACK) there is no hardware address as such.

Extract the hardware address from the ifreq structure


Having checked its type, the address can now be safely extracted from ifr.ifr_hwaddr.sa_data. It
is presented by an array of char, which could be a signed type, so if you wish to interpret it in any
way then it should first be converted to an unsigned representation. A crude but straightforward
way to achieve this is to cast the whole array to an unsigned char*:
const unsigned char* mac=(unsigned char*)ifr.ifr_hwaddr.sa_data;
printf("%02X:%02X:%02X:%02X:%02X:%02X\n",
mac[0],mac[1],mac[2],mac[3],mac[4],mac[5]);

See also
Get the IP address of a network interface in C using SIOCGIFADDR

Further reading
netdevice(7) (Linux manpage)

Get the IP address of a network interface in C using


SIOCGIFADDR

Content

1 Objective

2 Scenario

3 Method

3.1 Overview

3.2 Create an ifreq structure for passing data in and out of ioctl

3.3 Provide an open socket descriptor with the address family AF_INET

3.4 Invoke ioctl

3.5 Extract the IP address from the ifreq structure

4 See also

5 Further reading

Tested on
Debian (Lenny)
Ubuntu (Precise, Trusty)

Objective
To get the IPv4 address of a network interface in C using the ioctl command SIOCGIFADDR

Scenario
Suppose that you wish to display the IPv4 address of a network interface. The
variable if_name points to a null-terminated string containing the name of the interface (for
example, eth0).

Method
Overview
On Linux-based systems, one way to obtain the IPv4 address of an interface is to use
the ioctl command SIOCGIFADDR. The method described here has four steps:
1. Create an ifreq structure for passing data in and out of ioctl.
2. Provide an open socket descriptor with the address family AF_INET.
3. Invoke ioctl.
4. Extract the IP address from the ifreq structure.

The following header files are needed when using this method:
#include <sys/ioctl.h>
#include <net/if.h>
#include <netinet/in.h>

In addition, this particular implementation makes use of:


#include <errno.h>
#include <string.h>
#include <stdio.h>
#include <arpa/inet.h>

Please note that whilst this method can be used with some network protocols other than IPv4, the
Linux implementation does not support IPv6. Furthermore it is only able to return a single result for
any given network protocol, so will only return one of the addresses of an interface that has several.
It is not necessarily portable to other POSIX-compatible systems, and is no longer the preferred
method on Linux.

Create an ifreq structure for passing data in and out of ioctl


The ifreq structure should initially contain the name of the interface to be queried, which should be
copied into the ifr_name field. Since this is a fixed-length buffer you should take care to ensure that
the name does not cause an overrun:
struct ifreq ifr;
size_t if_name_len=strlen(if_name);
if (if_name_len<sizeof(ifr.ifr_name)) {
memcpy(ifr.ifr_name,if_name,if_name_len);
ifr.ifr_name[if_name_len]=0;
} else {
die("interface name is too long");
}

Provide an open socket descriptor with the address family AF_INET


All ioctl calls need a file descriptor to act on. In the case of SIOCGIFADDR this must refer to a socket
(as opposed to, for example, a regular file) and must be of the address family that you wish to
obtain (AF_INET in this instance). Otherwise any type of socket would suffice, but it should
preferably not be one that requires any obscure kernel modules to be loaded. For this example a
UDP socket will be used:
int fd=socket(AF_INET,SOCK_DGRAM,0);
if (fd==-1) {
die("%s",strerror(errno));
}

Invoke ioctl
Once you have the ifreq structure and socket descriptor then you are ready to invoke ioctl:
if (ioctl(fd,SIOCGIFADDR,&ifr)==-1) {
int temp_errno=errno;
close(fd);
die("%s",strerror(temp_errno));
}
close(fd);

If this completes without error then the IP address of the interface should have been returned
in ifr.ifr_addr in the form of a struct sockaddr_in.

Extract the IP address from the ifreq structure


If an address was returned at all then it ought to be an IPv4 address, because that was the address
family of the socket. To obtain the numerical value of the address you should:
1. Cast the returned address to a struct sockaddr_in.
2. Extract the sin_addr field of this structure to obtain a struct in_addr.

3. Extract the s_addr field of the in_addr structure to obtain


an in_addr_t (equivalent to a uint32_t).
4. Finally, convert the s_addr field (which is in network byte order) into
whatever representation you require.
struct sockaddr_in* ipaddr = (struct sockaddr_in*)&ifr.ifr_addr;
printf("IP address: %s\n",inet_ntoa(ipaddr->sin_addr));

See also
Get the IP address of a network interface in C using SIOCGIFADDR

Further reading
netdevice(7), Linux manpage
(Note that SIOCGIFADDR was not documented in netdevice(7) until version 3.40 of the Linux man-pages project, which was released in April 2012, so at the time of writing it had not been
incorporated into the stable releases of most GNU/Linux distributions. The ioctl itself has been
present in Linux since 1993.)

Get the index number of a Linux network interface in C


using SIOCGIFINDEX
Content

1 Objective

2 Background

3 Scenario

4 Method

4.1 Overview

4.2 Create an ifreq structure for passing data in and out of ioctl

4.3 Provide an open socket descriptor

4.4 Invoke ioctl

5 Further reading

Tested on
Debian (Lenny, Squeeze)
Ubuntu (Lucid, Precise,
Trusty)

Objective
To get the index number of a Linux network interface in C using the ioctl command SIOCGIFINDEX

Background
Network interfaces are usually identified by name in user-facing contexts, but for some APIs a
number is used instead. A notable example is the sin6_scope_id field of an IPv6 socket address with
link scope. Indices are also used in some types of netlink message (particularly those concerned
with routing) and in socket addresses for AF_PACKET sockets.
The interface index is typically not the same as the suffix which may form part of the interface
name. For example, on one of the machines tested by the author, eth0 had an index of 2. You
should not assume that they will be the same on other machines, or that they will necessarily
remain the same following a reboot.

Scenario
Suppose you wish to send a raw Ethernet frame using an AF_PACKET socket. To do this you need to
know the index number of the network interface from which the frame is to be sent.
The variable if_name points to a null-terminated string containing the name of the interface.

Method
Overview
On Linux-based systems the index number of a network interface can be obtained using
the ioctl command SIOCGIFINDEX. The method described here has three steps:
1. Create an ifreq structure for passing data in and out of ioctl.
2. Provide an open socket descriptor.
3. Invoke ioctl.

The following header files will be needed:


#include <errno.h>
#include <string.h>
#include <sys/ioctl.h>
#include <net/if.h>

Create an ifreq structure for passing data in and out of ioctl


The ifreq structure should initially contain the name of the interface to be queried, which should be
copied into the ifr_name field. Since this is a fixed-length buffer you should take care to ensure that
the name does not cause an overrun:
struct ifreq ifr;

size_t if_name_len=strlen(if_name);
if (if_name_len<sizeof(ifr.ifr_name)) {
memcpy(ifr.ifr_name,if_name,if_name_len);
ifr.ifr_name[if_name_len]=0;
} else {
die("interface name is too long");
}

Provide an open socket descriptor


The socket descriptor is merely an artefact of the way in which ioctl commands are invoked
generally, and is not used for any particular purpose by SIOCGIFINDEX. It must be open and must
refer to a socket (as opposed to, for example, a regular file).
In many of the circumstances where you would use SIOCGIFINDEX there will already be an open
socket that you can use. For example, in the particular scenario described above you could open
the AF_PACKET socket first and use that. Otherwise, you will need to open one specifically for the
purpose of being an argument to ioctl. Any type of socket would suffice, but it should preferably
not be one that requires any obscure kernel modules to be loaded:
int fd=socket(AF_UNIX,SOCK_DGRAM,0);
if (fd==-1) {
die("%s",strerror(errno));
}

Invoke ioctl
Once you have the ifreq structure and socket descriptor then you are ready to invoke ioctl:
if (ioctl(fd,SIOCGIFINDEX,&ifr)==-1) {
die("%s",strerror(errno));
}

If this completes without error then the interface index should have been returned
in ifr.ifr_ifindex.

Further reading
netdevice(7) (Linux manpage)

Cause a process to become a daemon in C

Content

1 Objective

2 Background and Scenario

3 Method

3.1 Fork, allowing the parent process to terminate

3.2 Start a new session for the daemon by calling setsid

3.3 Fork again, allowing the parent process to terminate

3.4 Change the current working directory to a safe location

3.5 Set the umask to zero

3.6 Close then reopen stdin, stdout and stderr

3.7 The complete method as a function

4 Testing

5 Variations

5.1 Redirect stdout and stderr to a logfile

5.2 Using SIGHUP for other purposes

6 Methods to avoid
6.1 Use the daemon function

Tested on
Debian (Etch, Lenny,
Squeeze)
Fedora (14)
Ubuntu (Hardy, Intrepid,
Jaunty, Karmic, Lucid,
Maverick, Natty, Precise,
Trusty)

Objective
To cause a process to become a daemon in C

Background and Scenario


See Cause a process to become a daemon. That page also gives a more detailed rationale for the
method, which is explained only in outline here.
A mechanism is needed for handling errors. The example code shown below assumes that there is a
function called die provided for this purpose, which takes the same arguments as printf and does
not return.

Method
Fork, allowing the parent process to terminate
Calling fork has three possible types of return value:
-1 indicates failure (most likely due to lack of memory, although it is
possible to run out of other resources such as PIDs).
0 indicates that the child is running, in which case execution should
continue with the next step of the daemonisation process.
Any other value indicates that the parent is running, in which case the
process should terminate by calling _exit.
pid_t pid = fork();
if (pid == -1) {
die("failed to fork while daemonising (errno=%d)",errno);
} else if (pid != 0) {
_exit(0);
}

Start a new session for the daemon by calling setsid


This operation should never fail, because the current process should not now be a process group
leader, however we check anyhow as a precaution:
if (setsid()==-1) {
die("failed to become a session leader while daemonising(errno=%d)",errno);
}

Fork again, allowing the parent process to terminate


This is a repeat of the first step, except that a handler must be installed for SIGHUP:
// Ignore SIGHUP before forking: the parent of this fork is a session leader,
// and its exit is expected to deliver a SIGHUP that must not kill the child.
signal(SIGHUP,SIG_IGN);
pid=fork();
if (pid == -1) {
    die("failed to fork while daemonising (errno=%d)",errno);
} else if (pid != 0) {
    // Parent: terminate immediately, leaving only the child running.
    _exit(0);
}

The SIGHUP handler must remain in place until it has absorbed the SIGHUP that the parent is expected
to send when it terminates. See below if you wish to install a SIGHUP handler for other purposes.

Change the current working directory to a safe location


The root directory is used here, as it is always a safe location and can be changed later if required:
if (chdir("/") == -1) {
die("failed to change working directory while daemonising (errno=%d)",errno);
}

Set the umask to zero


Daemons normally operate with a umask of zero. Again, this can be changed later if required:
umask(0);

Close then reopen stdin, stdout and stderr


The POSIX specification requires that /dev/null be provided, therefore the daemon can reasonably
depend on this device being available provided that they fail gracefully if it is not.
When stderr is opened it must be both readable and writable. It is sufficient for stdin to be
readable and stdout to be writable. If stdout or stderr refer to a regular file then they should be
configured to append to it (by means of the O_APPEND flag). Because the open function always
chooses the lowest unused file descriptor, by reopening the streams in ascending order it is possible
to avoid the use of dup2:
close(STDIN_FILENO);
close(STDOUT_FILENO);
close(STDERR_FILENO);
if (open("/dev/null",O_RDONLY) == -1) {
die("failed to reopen stdin while daemonising (errno=%d)",errno);
}
if (open("/dev/null",O_WRONLY) == -1) {
die("failed to reopen stdout while daemonising (errno=%d)",errno);
}
if (open("/dev/null",O_RDWR) == -1) {
die("failed to reopen stderr while daemonising (errno=%d)",errno);
}

See below if you want to direct stdout and stderr to a logfile.

The complete method as a function


#include <errno.h>
#include <signal.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/stat.h>

/*
 * Turn the calling process into a daemon.
 *
 * Sequence: fork and let the parent exit, start a new session, fork again
 * (ignoring SIGHUP) and let that parent exit too, move to the root
 * directory, clear the umask, then re-point stdin, stdout and stderr at
 * /dev/null.
 *
 * Every failure is reported through die(), which takes printf-style
 * arguments and is expected not to return.
 */
void daemonise() {
    // First fork: only the child carries on past this switch.
    switch (fork()) {
    case -1:
        die("failed to fork while daemonising (errno=%d)",errno);
        break;
    case 0:
        break;
    default:
        _exit(0);
    }

    // Become the leader of a brand-new session.
    const pid_t session = setsid();
    if (session == -1) {
        die("failed to become a session leader while daemonising(errno=%d)",errno);
    }

    // Second fork: SIGHUP is ignored first, because the exiting session
    // leader is expected to send one to the surviving child.
    signal(SIGHUP,SIG_IGN);
    switch (fork()) {
    case -1:
        die("failed to fork while daemonising (errno=%d)",errno);
        break;
    case 0:
        break;
    default:
        _exit(0);
    }

    // Work from the root directory.
    const int moved = chdir("/");
    if (moved == -1) {
        die("failed to change working directory while daemonising (errno=%d)",errno);
    }

    // Clear the file creation mask.
    umask(0);

    // Release descriptors 0, 1 and 2 ...
    for (int fd = STDIN_FILENO; fd <= STDERR_FILENO; fd++) {
        close(fd);
    }

    // ... and reopen them in ascending order; open() always hands back the
    // lowest free descriptor, so each call lands on the intended slot.
    const int new_stdin = open("/dev/null",O_RDONLY);
    if (new_stdin == -1) {
        die("failed to reopen stdin while daemonising (errno=%d)",errno);
    }
    const int new_stdout = open("/dev/null",O_WRONLY);
    if (new_stdout == -1) {
        die("failed to reopen stdout while daemonising (errno=%d)",errno);
    }
    const int new_stderr = open("/dev/null",O_RDWR);
    if (new_stderr == -1) {
        die("failed to reopen stderr while daemonising (errno=%d)",errno);
    }
}

Testing
See Cause a process to become a daemon.

Variations
Redirect stdout and stderr to a logfile
When directing output to a logfile, it is best to open the file before closing stderr to ensure that the
daemon is not left with no means of reporting errors:

close(STDIN_FILENO);
if (open("/dev/null",O_RDONLY) == -1) {
die("failed to reopen stdin while daemonising (errno=%d)",errno);
}
int logfile_fileno = open(logfile_pathname,O_RDWR|O_CREAT|O_APPEND,S_IRUSR|S_IWUSR|S_IRGRP);
if (logfile_fileno == -1) {
die("failed to open logfile (errno=%d)",errno);
}
dup2(logfile_fileno,STDOUT_FILENO);
dup2(logfile_fileno,STDERR_FILENO);
close(logfile_fileno);

Note that dup2 will close the target file descriptor if necessary, so there is no need to do this
explicitly.

Using SIGHUP for other purposes


Daemons often interpret SIGHUP as a request to reread the configuration file. A signal handler must
be installed to perform this function, however it must not become fully active until after the parent
process of the second fork operation has terminated (as that event will generate a SIGHUP).
One solution is to use a flag within the handler function to treat the first call differently:
void handle_sighup(int signum) {
static bool first=true;
if (first) {
first=false;
return;
}
// Insert remainder of handler here.
}

When installing the signal handler, it is better to use sigaction in preference to the signal function
because that allows the SA_RESTART flag to be used. Without this, it is necessary to place a loop
around any system function that is capable of returning EINTR:
struct sigaction sa;
sa.sa_handler = handle_sighup;
sigemptyset(&sa.sa_mask);
sa.sa_flags = SA_RESTART;
if (sigaction(SIGHUP,&sa,0) == -1) {
die("failed to install SIGHUP handler (errno=%d)",errno);
}

Methods to avoid
Use the daemon function
Many POSIX-based operating systems provide a function called daemon which performs some or all
of the steps listed above. Unfortunately it has three significant drawbacks:
It is not available on all systems.
Its behaviour is not standardised (or necessarily well-documented).
Its behaviour is more difficult to customise.
For these reasons, any benefit gained by using the daemon function is likely to be a short-term one at
best.

Tags: c | posix | process

Pad an integer with leading zeros in C++


Content

1 Objective

2 Scenario

3 Method

4 Alternative
4.1 Using sprintf

Tested on
Ubuntu (Lucid, Precise)

Objective
To pad an integer with leading zeros to a given minimum width when converting it to a character
string in C++

Scenario
Suppose you are writing a program for generating customer invoices. Each customer has an
account number. These are represented internally as integers, but when converted to character
strings for display or printing you want them to be padded to 8 digits using leading zeros.

Method
The method described here uses the C++ iostream library to perform the conversion. It requires an
output stream for the result to be sent to, however a std::ostringstream can be used to capture the
character sequence and present it as a std::string if required. Padding with zeros is achieved by
combining the effect of three standard manipulators:

std::setw, to specify the required width in characters of the next field
written to the stream,

std::setfill, to specify the character used for padding if the required width
of a field is greater than its natural width, and

std::internal, to arrange for padding to occur after the sign but before the
remainder of the number.

The required header files are:


Header

Used by

<ios>

std::internal

<iomanip>

std::setw, std::setfill

<sstream>

std::ostringstream

If you are using a std::ostringstream that will be discarded immediately after the conversion then
simply write the three manipulators to the stream (in any order) followed by the value to be
converted:
// Convert an account number to text, zero-padded to a minimum width of 8
// characters. std::internal places the padding after any sign, so -1 becomes
// "-0000001". Values wider than 8 characters are returned in full.
std::string format_account_number(int acct_no) {
    std::ostringstream out;
    out << std::internal << std::setfill('0') << std::setw(8) << acct_no;
    return out.str();
}

If the stream will be used subsequently for other purposes then you will probably want to reset the
fill character and field adjustment properties, otherwise they will remain in effect for later output. It
is not necessary to do this for the field width, which is automatically reset to zero after each field is
written:
// Write an account number to `out`, zero-padded to a minimum width of 8
// characters with the padding placed after any sign.
//
// The stream's format flags and fill character are captured beforehand and
// restored afterwards, so later output keeps whatever settings the caller
// had. The field width needs no restoring: it resets to zero after each
// formatted field is written.
void write_account_number(std::ostream& out, int acct_no) {
    const std::ios_base::fmtflags saved_flags = out.flags();
    const char saved_fill = out.fill();
    out << std::internal << std::setfill('0') << std::setw(8) << acct_no;
    out.fill(saved_fill);
    out.flags(saved_flags);
}

The std::internal manipulator can be omitted if the number is unsigned or known to be non-negative, but it is needed in the general case because otherwise the padding characters will
be inserted at the far left of the field by default (producing output such as 000000-1 as opposed to
-0000001).
Be aware that std::setw controls the total width of the field (including the sign if there is
one), with the consequence that negative values will by default be one digit shorter than non-negative values. If this is a problem then std::showpos can be used to ensure that there is always a
sign (plus or minus), in which case the number of digits remains constant.
Floating point values can be padded in a similar manner. The same applies to character strings,
except that std::internal would be ineffective.

Alternative
Using sprintf
A similar effect can be achieved using std::snprintf from <cstdio>:
// snprintf-based variant: formats the account number as at least 8
// zero-padded digits. The buffer holds 8 characters plus the terminator, so
// any longer result is truncated by snprintf.
std::string format_account_number(int acct_no) {
    char digits[8 + 1];
    std::snprintf(digits, sizeof digits, "%08d", acct_no);
    return std::string(digits);
}

For a typical implementation of the standard library this method is likely to be significantly faster
than using a std::ostringstream (and would be faster still if std::string were avoided too). The
cost is that buffer management and type safety become your responsibility, with undefined
behaviour the likely consequence if you make a mistake.
A minor difference is that std::snprintf will truncate to whatever buffer length you have chosen,
whereas std::ostringstream will not.

Build a shared library using GCC


Content

1 Objective

2 Background

3 Scenario

4 Method

4.1 Overview

4.2 Choose an soname (if required)

4.3 Compile the source code using the -fPIC option

4.4 Link the object code using the -fPIC and -shared options

5 Testing

6 Alternatives

6.1 Using GNU Libtool


7 Further reading

Tested on
Debian (Etch, Lenny,
Squeeze)
Ubuntu (Hardy, Intrepid,
Jaunty, Karmic, Lucid,
Maverick, Natty, Oneiric,
Precise, Quantal)

Objective
To build a shared library using GCC

Background
Programs can be linked against libraries either at compile time or at run time. An advantage of
linking at run time is that a single copy of the library can be shared between many programs, both
on disc and in memory. Libraries suitable for use in this way are known as shared libraries.
On modern Linux-based systems, shared libraries differ from static ones in the following respects:
they are ELF files (as opposed to archives compatible with
the ar program),
they have a dynamic symbol table (in addition to a static table), and
the code within them must be position-independent.
For these reasons, some adjustments to the build process are needed to create a shared library
instead of a static one.

Scenario
Suppose that you are building a library named libqux which is written in C. There are three source
files: foo.c, bar.c and baz.c.
The current version number of libqux is 1.5.0. It is fully backward-compatible with the previous
version, 1.4.1, which had an soname of libqux.so.1.

Method
Overview
The method described here has three steps:
1. Choose an soname (if required).
2. Compile the source code using the -fPIC option.
3. Link the object code using the -fPIC and -shared options.

Choose an soname (if required)


An soname (shared object name) is a label used when declaring shared library dependencies.
Each executable contains a list of shared libraries that it needs in order to execute. Shared libraries
can similarly declare dependencies on other shared libraries. This can be done using pathnames, but
if the required library has an soname then that will be used in preference.
Typically the pathname of a library will change whenever a new version is installed, whereas the
soname should change only when the new version is incompatible with its predecessors to the
extent that it cannot be used in their place. It follows that when dependencies are declared using
sonames, the library used at runtime need not be an exact match for the one present at build time:
For a given library with a given soname, only the most recent version
need be installed.
Where there is a need for two versions of the same library to be installed
alongside each other, they can be distinguished because they have
different sonames.
It is the degree of binary compatibility which determines whether the soname should change. For
example, new functions can be added without breaking backward compatibility, but you cannot
normally change the prototype of an existing function, nor do anything that could change the layout
of a data structure. You should also consider changes made to the high-level behaviour of the
library, as these can have an equally significant effect on backwards compatibility.
In this particular instance, version 1.4.1 of libqux had an soname of libqux.so.1. Since version
1.5.0 is backwards-compatible it can use the same soname. If this had not been the case then the
soname would have needed to change, most likely to libqux.so.2.

Compile the source code using the -fPIC option


Object code intended for use in a shared library must be position-independent, meaning that it can
execute without first being modified to account for where it has been loaded in memory. It remains
necessary to allow for the location of other libraries, but any internal references are required to be
position-independent.
GCC can be instructed to generate position independent code using the -fPIC option:
gcc -c -fPIC -o foo.o foo.c
gcc -c -fPIC -o bar.o bar.c
gcc -c -fPIC -o baz.o baz.c

This option is not enabled by default because it tends to cause some loss of performance, and for
purposes other than building shared libraries it is often not necessary.

Link the object code using the -fPIC and -shared options
The default behaviour of the gcc and g++ commands when linking is to produce an executable
program. They can be instructed to produce a shared library instead by means of the -shared option:
gcc -shared -fPIC -Wl,-soname,libqux.so.1 -o libqux.so.1.5.0 foo.o bar.o baz.o -lc

The -fPIC option is needed when linking as it was when compiling to ensure that any code added
by the linker is compatible with code previously generated by the compiler.
The -Wl option passes a comma-separated list of arguments to the linker. As its name suggests, -soname specifies the required soname. If these options are omitted then the library will not have an
soname.
The ldconfig manpage recommends explicitly linking against libc, which has been done above
using the -l option (-lc).

Testing
One way to test the library is to install it in a directory on the library search path. /usr/local/lib is
usually the most appropriate choice. You will need to create softlinks corresponding to the soname
of the library, and the name used to refer to the library when building the executable, if these are
different from the filename:
ln -s libqux.so.1.5.0 libqux.so.1
ln -s libqux.so.1.5.0 libqux.so

A partial alternative is to run ldconfig, which automatically creates the first of the above softlinks
but not the second. However you do it, this method of testing normally requires administrative
privileges. Once installed, it should be possible to link against the library using -l:
gcc main.c -lqux

If you cannot or do not want to move the library to /usr/local/lib then it is possible to link against
the library in situ. At build time this can be done by listing the pathname of the library as an
argument to gcc without use of the -l option:
gcc main.c libqux.so.1.5.0

At load time you will need to add the relevant directory to the library search path. This can be done
by setting the environment variable LD_LIBRARY_PATH, for example:

export LD_LIBRARY_PATH=`pwd`

As above, you will need to create a softlink corresponding to the soname of the library. If there is a
need to search multiple directories then they should be specified as a colon-separated list
in LD_LIBRARY_PATH.

Alternatives
Using GNU Libtool
Libtool is part of GNU Autotools. Its purpose is to simplify the process of building shared libraries,
particularly those intended for use on multiple platforms. For example, for the scenario described
above you could use the following sequence of commands:
libtool --mode=compile gcc -c foo.c
libtool --mode=compile gcc -c bar.c
libtool --mode=compile gcc -c baz.c
libtool --mode=link gcc -o libqux.la foo.lo bar.lo baz.lo -rpath /usr/local/lib -version-info 6:0:5

You may not need to run these commands explicitly, because Libtool is often used in conjunction with
Automake which has the ability to generate them automatically, but it is equally suitable for use as
a stand-alone utility if that suits your purpose.
Be aware that Libtool requires the use of a specific numbering scheme for specifying the interface
version (passed using the -version-info option above), and that this should almost certainly not be
equal to the release version. The Libtool manual describes when and how these values should be
changed.

Further reading
Program Library HOWTO, David A Wheeler
Libtools versioning system, GNU Libtool Manual, GNU Project
Vaughan et al, Library Versioning, GNU Autoconf, Automake and Libtool
ldconfig(8) (Ubuntu manpage)

Capture the output of a child process in C


Content

1 Objective

2 Scenario

3 Method

3.1 Overview

3.2 Create a new pipe using the pipe function

3.3 Connect the entrance of the pipe to STDOUT_FILENO within the child process

3.4 Close the entrance of the pipe within the parent process

3.5 Close the exit from the pipe within the child process

3.6 Sample code

4 Alternatives

4.1 Using O_CLOEXEC to close file descriptors

4.2 Using popen

5 See also

6 Further reading

Tested on
Debian (Etch, Lenny,
Squeeze)
Ubuntu (Hardy, Intrepid,
Jaunty, Karmic, Lucid,
Maverick, Natty, Oneiric,
Precise, Quantal)

Objective
To capture the standard output of a child process in C

Scenario
Suppose that you are writing a program which executes a command as a child process
using fork and exec:
pid_t pid = fork();
if (pid == -1) {
perror("fork");
exit(1);
} else if (pid == 0) {

execl(cmdpath, cmdname, (char*)0);


perror("execl");
_exit(1);
}

The command is expected to write some text to stdout and you wish to capture this output for use
by the parent process.

Method
Overview
The method described here has four steps:
1. Create a new pipe using the pipe function.
2. Connect the entrance of the pipe to STDOUT_FILENO within the child process.
3. Close the entrance of the pipe within the parent process.
4. Close the exit from the pipe within the child process.
The parent process will then be able to read the output of the child process from the exit of the pipe.
The following header files are used:
Header

Used by

<errno.h>

errno, EINTR

<stdio.h>

perror

<stdlib.h>

exit

<unistd.h>

_exit, close, dup2, execl, fork, pipe, STDOUT_FILENO

<sys/wait.h>

wait, pid_t

Create a new pipe using the pipe function


A pipe is an anonymous first-in, first-out (FIFO) buffer with endpoints presented as file descriptors.
Because these can be owned by different processes, it provides a convenient means for transporting
the output of the child process to the parent process:
int filedes[2];
if (pipe(filedes) == -1) {
perror("pipe");
exit(1);
}

The file descriptor for the entrance to the pipe is written to filedes[1] and the exit to filedes[0].
The former must be transferred to the child process, the latter retained by the parent process. The
simplest way to arrange this is to create the pipe before the child process is forked (thus ensuring
that each process receives a copy of both descriptors).

Connect the entrance of the pipe to STDOUT_FILENO within the child process
When a process forks, the child inherits a set of file descriptors that are copies of those owned by
the parent process. Consequently, if the standard output of the parent process is routed to a
particular terminal device then the same will be true of the child process (in the first instance).
To capture the output of the child process, its standard output must instead be routed into the pipe.
This can be arranged using the dup2 function:
while ((dup2(filedes[1], STDOUT_FILENO) == -1) && (errno == EINTR)) {}

The effect is to close the file descriptor STDOUT_FILENO if it was previously open, then (re)open it as
a copy of filedes[1]. A loop is needed to allow for the possibility of dup2 being interrupted by a
signal. Once this has been done, filedes[1] can be closed:
close(filedes[1]);

It would be equally acceptable to copy the descriptor onto STDERR_FILENO in order to capture the
standard error stream. To capture both stdout and stderr you can either create two separate pipes,
or if it is acceptable for the streams to be mixed, copy the same file descriptor onto
both STDOUT_FILENO and STDERR_FILENO by calling dup2 twice.

Close the entrance of the pipe within the parent process


The parent process has no need to access the entrance to the pipe, so filedes[1] should be closed
within that process too:
close(filedes[1]);

Close the exit from the pipe within the child process
Similarly, the child process has no need to access the exit from the pipe:
close(filedes[0]);

(You should also have made arrangements to close any other file descriptors not needed by the
child process, regardless of whether you want to capture its output.)

Sample code
The code for managing the pipe can be integrated into the existing program as follows:
// Create the pipe before forking so both processes hold both descriptors:
// filedes[1] is the entrance (write end), filedes[0] the exit (read end).
int filedes[2];
if (pipe(filedes) == -1) {
    perror("pipe");
    exit(1);
}
pid_t pid = fork();
if (pid == -1) {
    perror("fork");
    exit(1);
} else if (pid == 0) {
    // Child: route stdout into the pipe entrance. Retry only when
    // interrupted by a signal; any other failure is fatal, because carrying
    // on would run the command with its output going to the original stdout
    // instead of the pipe.
    while (dup2(filedes[1], STDOUT_FILENO) == -1) {
        if (errno != EINTR) {
            perror("dup2");
            _exit(1);
        }
    }
    // stdout now refers to the pipe, so neither original descriptor is needed.
    close(filedes[1]);
    close(filedes[0]);
    execl(cmdpath, cmdname, (char*)0);
    // Reached only if execl failed.
    perror("execl");
    _exit(1);
}
// Parent: close the entrance so that end-of-file is seen on filedes[0] once
// the child has finished writing.
close(filedes[1]);

It is then possible for the parent process to read the output of the child process from file
descriptor filedes[0]:
char buffer[4096];
while (1) {
ssize_t count = read(filedes[0], buffer, sizeof(buffer));
if (count == -1) {
if (errno == EINTR) {
continue;
} else {
perror("read");
exit(1);
}
} else if (count == 0) {
break;
} else {
handle_child_process_output(buffer, count);
}
}
close(filedes[0]);
wait(0);

If you need to avoid blocking while waiting for output from the child then this can be arranged
using select, O_NONBLOCK or similar.

Alternatives
Using O_CLOEXEC to close file descriptors
If you want to capture its output then it is quite likely that (as in this example) the child process will
be calling a function from the exec family to transfer control to another program. An alternative
method is then available for closing the pipe exit within the child process, by setting
the close-on-exec flag (FD_CLOEXEC, the descriptor-level counterpart of O_CLOEXEC):
if (fcntl(filedes[0], F_SETFD, FD_CLOEXEC) == -1) {
perror("fcntl");

exit(1);
}

This should be done in the parent process prior to forking. It avoids the need to take any explicit
action within the child process to close the file descriptor, provided that exec is called. This makes
little difference if there is only one file descriptor to close, but when there are many child processes
executing in parallel the benefits are more noticeable: one system call is needed instead of many, and
because the flag can be set immediately when the pipe is created there is less risk of file descriptors
being missed.

Using popen
The popen function provides most of the functionality described above in the form of a single
function call:
FILE* fp = popen("pwd", "r");
// ...
int status = pclose(fp);

This is undeniably simpler than constructing the pipework explicitly, but popen can also be quite
limiting:

It returns a stdio stream as opposed to a raw file descriptor, which is
unsuitable for handling the output asynchronously.
Rather than executing the command directly, popen typically spawns an
instance of the shell first. This can adversely affect performance, and may
have other undesirable side effects.
It is possible to attach to the standard output of the child process or the
standard input, but not both at the same time.
It does not provide access to the process ID of the child process.
There is no opportunity to modify the context of the child process
before exec is called.
Workarounds are possible for some of these issues, but in the author's experience it is generally
better to accept the minor inconvenience of calling pipe, fork and exec explicitly rather than
attempting a popen-based solution and taking the risk of it later needing to be rewritten.

See also
Reap zombie processes using a SIGCHLD handler

Further reading
pipe, Base Specifications Issue 7, The Open Group, 2008
dup, Base Specifications Issue 7, The Open Group, 2008
Tags: c | posix | process

Reap zombie processes using a SIGCHLD handler


Content

1 Objective

2 Background

3 Scenario

4 Method

4.1 Overview

4.2 Define a handler for SIGCHLD that calls waitpid

4.3 Register the SIGCHLD handler

5 Alternatives

5.1 Explicitly set the SIGCHLD handler to SIG_IGN

5.2 Set the SA_NOCLDWAIT flag

6 See also

7 Further reading

Tested on
Debian (Etch, Lenny,
Squeeze)
Ubuntu (Hardy, Intrepid,
Jaunty, Karmic, Lucid,
Maverick, Natty, Oneiric,
Precise, Quantal)

Objective
To install a SIGCHLD handler for reaping zombie processes

Background
When a child process terminates it does not disappear entirely. Instead it becomes a zombie
process which is no longer capable of executing, but which still has a PID and an entry in the
process table. This is indicated by the state code Z in ps or top.
The presence of a moderate number of zombie processes is not particularly harmful, but they add
unnecessary clutter that can be confusing to the administrator. In extreme cases they could exhaust
the number of available process table slots. For these reasons, well-behaved programs should
ensure that zombie processes are removed in a timely manner.
The process of eliminating zombie processes is known as reaping. The simplest method is to
call wait, but this will block the parent process if the child has not yet terminated. Alternatives are
to use waitpid to poll or SIGCHLD to reap asynchronously. The method described here uses SIGCHLD.

Scenario
Suppose you have written a network server which spawns a separate child process to handle each
connection. The child process terminates itself when the connection closes, without any
involvement from the parent process. It would be unacceptable for the parent process to block,
therefore calling wait immediately after fork is not an option.

Method
Overview
The method described here has two steps:
1. Define a handler for SIGCHLD that calls waitpid.
2. Register the SIGCHLD handler.

Note that the signal is named SIGCHLD with an H, as opposed to SIGCLD (which has a similar function,
but potentially different semantics and is non-portable).
The following header files are used:
Header

Used by

<signal.h>

sigaction, sigemptyset, struct sigaction, SIGCHLD, SA_RESTART, SA_NOCLDSTOP

<stdio.h>

perror

<stdlib.h>

exit

<sys/wait.h>

waitpid, pid_t, WNOHANG

Define a handler for SIGCHLD that calls waitpid


The operations that can be safely performed within a signal handler are very limited, but they
include use of the waitpid function:
/*
 * SIGCHLD handler: reap every child that has already terminated.
 *
 * waitpid is called with WNOHANG so the handler never blocks, and in a loop
 * because several children may have exited for a single delivered SIGCHLD.
 *
 * sig: number of the signal being delivered (unused).
 */
void handle_sigchld(int sig) {
    // The handler can interrupt code that is about to inspect errno, and
    // waitpid overwrites it (for instance with ECHILD once no children are
    // left), so the interrupted code's value is preserved across the loop.
    int saved_errno = errno;
    (void)sig;
    while (waitpid((pid_t)(-1), 0, WNOHANG) > 0) {}
    errno = saved_errno;
}

The reason for calling waitpid as opposed to wait is to allow use of the WNOHANG option, which
prevents the handler from blocking. This allows for the possibility of SIGCHLD being raised for
reasons other than the termination of a child process.
(SIGCHLD has three conventional uses: to indicate that a child process has terminated, stopped or
continued. The latter two conditions can be suppressed using SA_NOCLDSTOP as described below, but
that would not prevent a process with the right permissions from raising SIGCHLD for any reason
using the kill function or an equivalent.)

The reason for placing waitpid within a loop is to allow for the possibility that multiple child
processes could terminate while one is in the process of being reaped. Only one instance
of SIGCHLD can be queued, so it may be necessary to reap several zombie processes during one
invocation of the handler function.
The loop ensures that any zombies which existed prior to invocation of the handler function will be
reaped. If any further zombies come into being after that moment in time then they may or may not
be reaped by that invocation of the handler function (depending on the timing), but they should
leave behind a pending SIGCHLD that will result in the handler being called again.

Register the SIGCHLD handler


The POSIX-recommended method for registering a signal handler is to use the sigaction function:

struct sigaction sa;


sa.sa_handler = &handle_sigchld;
sigemptyset(&sa.sa_mask);
sa.sa_flags = SA_RESTART | SA_NOCLDSTOP;
if (sigaction(SIGCHLD, &sa, 0) == -1) {
perror(0);
exit(1);
}

You should do this before any child processes terminate, which in practice means registering before
any are spawned. (POSIX neither requires nor prohibits SIGCHLD being raised in respect of a child
that had already terminated when the handler was registered, so a program which relied on this
happening might work but would not be portable.)
When an operating system function is interrupted by a signal the default behaviour is to return
immediately (either with the error EINTR, or reporting partial completion if that is possible). This
creates a need for such functions to be wrapped in a loop for the purpose of handling EINTR, which
is both inconvenient and error-prone. Setting the SA_RESTART flag when the signal is registered
makes this unnecessary in most cases, and is recommended unless you have a good reason not to.
Setting the SA_NOCLDSTOP flag prevents SIGCHLD from being raised when a child process stops or
continues (as opposed to terminating). Since our interest is confined to processes that have
terminated, there is no harm in this and it may prevent the handler being invoked unnecessarily. It
does not obviate the need to use WNOHANG within the handler because it does not
prevent SIGCHLD from being raised in some other way.

Alternatives
Explicitly set the SIGCHLD handler to SIG_IGN
If (as in the example above) the signal handler does nothing beyond calling waitpid then an
alternative is available. Setting the SIGCHLD handler to SIG_IGN will cause zombie processes to be
reaped automatically:
/* Set the SIGCHLD disposition to SIG_IGN explicitly using sigaction. */
struct sigaction sa;
sa.sa_handler = SIG_IGN;
/* No handler function will run, so the mask is empty and no flags are set. */
sigemptyset(&sa.sa_mask);
sa.sa_flags = 0;
if (sigaction(SIGCHLD, &sa, 0) == -1) {
/* Registration failed: print the errno description and exit with status 1. */
perror(0);
exit(1);
}

This can be implemented portably and somewhat more concisely with the signal function if you
prefer:

/* Same effect as the sigaction form: set SIGCHLD to SIG_IGN; signal
   returns SIG_ERR on failure. */
if (signal(SIGCHLD, SIG_IGN) == SIG_ERR) {


/* Could not change the disposition: print the errno description and exit. */
perror(0);
exit(1);
}

Note that it is not sufficient for SIGCHLD to have a disposition that causes it to be ignored (as the
default, SIG_DFL, would do): it is only by setting it to SIG_IGN that this behaviour is obtained.
One drawback of this method is that it is slightly less portable than explicitly calling waitpid: the
behaviour it depends on is required by POSIX.1-2001, and previously by the Single Unix
Specification, but not by POSIX.1-1990.

Set the SA_NOCLDWAIT flag


Another way to achieve the same outcome is to set the SA_NOCLDWAIT flag when installing the signal
handler:
/* Install handle_sigchld for SIGCHLD with SA_NOCLDWAIT added to the flags. */
struct sigaction sa;
sa.sa_handler = &handle_sigchld;
/* Empty mask: no additional signals are blocked while the handler runs. */
sigemptyset(&sa.sa_mask);
/* SA_NOCLDWAIT is the only difference from the plain registration. */
sa.sa_flags = SA_RESTART | SA_NOCLDSTOP | SA_NOCLDWAIT;
if (sigaction(SIGCHLD, &sa, 0) == -1) {
/* Registration failed: print the errno description and exit with status 1. */
perror(0);
exit(1);
}

Unfortunately this is not as useful as it could be, because it is implementation-defined


whether SIGCHLD is raised in response to process termination when SA_NOCLDWAIT is set. Since you
cannot rely on the handler function being invoked, it follows that the handler cannot actually do
anything if you want its behaviour to be portable. At that point you may as well set the handler
to SIG_IGN, in which case there is arguably no need to set SA_NOCLDWAIT.

There is one small advantage to using SA_NOCLDWAIT: if it is supported at all then you can be
reasonably confident that it will have the desired behaviour, whereas for SIG_IGN this is assured
only if the operating system declares conformance to an appropriate version of POSIX or SUS.

See also
Capture the output of a child process in C

Further reading
wait, waitpid, Base Specifications Issue 7, The Open Group, 2008
<signal.h>, Base Specifications Issue 7, The Open Group, 2008

Calculate an Internet Protocol checksum in C


Content

1 Objective

2 Background

3 Scenario

4 Method

4.1 Overview

4.2 Implementation (optimised for clarity)

4.3 Implementation (optimised for speed)

5 Testing

6 Variations

6.1 Verifying a checksum

6.2 Avoiding the use of memcpy

6.3 Omitting the conversion between network and host byte order

7 Further reading

Tested on
Debian (Lenny)

Objective
To calculate an Internet Protocol checksum in C

Background
RFC 791 defines the following checksum algorithm for use when constructing the header of an
IPv4 datagram:
The checksum field is the 16 bit one's complement of the one's complement
sum of all 16 bit words in the header. For purposes of computing the
checksum, the value of the checksum field is zero.
The same algorithm is used by a number of other IP-based protocols including TCP, UDP and
ICMP. Implementation techniques are discussed in RFC 1071, RFC 1141 and RFC 1624.

Scenario
Suppose that you wish to send an ICMP echo request using a raw socket. Like all ICMP messages
this contains a checksum that is calculated using the algorithm described above. Given the message
to be sent, you wish to calculate the required checksum.

Method
Overview
The checksum can be calculated using the following algorithm:
1. Set the checksum field to zero.
2. Pad the data to an even number of bytes.
3. Reinterpret the data as a sequence of 16-bit unsigned integers that are in
network byte order.
4. Calculate the sum of the integers, subtracting 0xffff whenever the result
reaches 0x10000 or greater.

5. Calculate the bitwise complement of the sum. This is the required value
of the checksum field.
Ones complement notation has two representations for the number zero: normal zero ( 0x0000 in
this case) and negative zero (0xffff). It is not completely clear how these should be handled:
RFC 791 states only that ones complement arithmetic should be used,
and does not address the question of how zero is represented.

The incremental algorithm recommended by RFC 1624 always prefers


normal zero over negative zero, and the text makes clear that this was an
explicit design goal.
The non-incremental algorithm described in 4.1 of RFC 1071 behaves
similarly, except in the special case where the data is all zeros (which can
never occur in a valid IP datagram header). It would not be feasible for an
incremental algorithm to replicate this idiosyncrasy.
In the interests of consistency, the implementations described here prefer normal zero over negative
zero in all cases (even where the data is all zeros). This is achieved by initialising the accumulated
sum to negative zero (0xffff), which makes no difference to the final result except in the case
where nothing is added to it.
To exactly replicate the behaviour of the example given in RFC 1071, the accumulator should
instead be initialised to normal zero (0x0000).

Implementation (optimised for clarity)


Here is a near-literal implementation of the algorithm described above:
// Compute the Internet checksum (RFC 791 / RFC 1071 style) of a buffer.
//
// vdata:  start of the data, laid out as it appears on the wire, with the
//         checksum field already zeroed by the caller.
// length: number of bytes to cover; an odd trailing byte is padded with zero.
//
// Returns the checksum in network byte order, ready to be stored directly in
// the checksum field. Normal zero (0x0000) is preferred over negative zero.
uint16_t ip_checksum(void* vdata,size_t length) {
    // View the buffer as bytes so that it can be walked with plain offsets.
    char* bytes=(char*)vdata;

    // Starting from negative zero only matters when nothing is added.
    uint32_t sum=0xffff;

    // Bytes that belong to complete 16-bit words (length rounded down to even).
    size_t paired=length-(length&1);

    size_t pos=0;
    while (pos<paired) {
        // memcpy rather than a pointer cast: no alignment or aliasing assumptions.
        uint16_t chunk;
        memcpy(&chunk,bytes+pos,sizeof(chunk));
        sum+=ntohs(chunk);
        // End-around carry: keep the running sum within 16 bits.
        if (sum>=0x10000) {
            sum-=0xffff;
        }
        pos+=2;
    }

    // A single byte is left over when the length is odd: it becomes the
    // first byte of a zero-padded word.
    if (pos<length) {
        uint16_t chunk=0;
        memcpy(&chunk,bytes+pos,1);
        sum+=ntohs(chunk);
        if (sum>=0x10000) {
            sum-=0xffff;
        }
    }

    // Complement the low 16 bits and convert back to network byte order.
    return htons((uint16_t)~sum);
}

The data should be passed to the function in network byte order with the checksum field already
zeroed. The result is returned in network byte order, so is ready to be written directly into the
checksum field.

If there is an odd byte at the end of the data then this is treated as a special case so that padding can
be done on the fly. The calls to memcpy are needed to avoid breaking the strict aliasing rules, which
prevent an arbitrary type from being safely cast to a uint16_t.

Implementation (optimised for speed)


The following implementation uses two techniques to improve performance:
deferring carries until the end of the calculation by allowing the
accumulator to exceed 0xffff, and
performing multiple additions in parallel.
// Compute the Internet checksum of a buffer, 32 bits at a time.
//
// vdata:  start of the data, laid out as it appears on the wire, with the
//         checksum field already zeroed by the caller.
// length: number of bytes to cover.
//
// Carries are allowed to pile up in a 64-bit accumulator and are folded back
// only once at the end. Returns the checksum in network byte order, preferring
// normal zero (0x0000) over negative zero.
uint16_t ip_checksum(void* vdata,size_t length) {
    // Byte cursor that advances through the buffer.
    char* cursor=(char*)vdata;

    // Starting from negative zero only matters when nothing is added.
    uint64_t total=0xffff;

    // Distance of the start address from the previous 4-byte boundary.
    unsigned int misalign=((uintptr_t)cursor)&3;
    if (misalign!=0) {
        // Consume the bytes up to the next boundary (or all of them if fewer),
        // placed at the same position within a zeroed word.
        size_t lead=4-misalign;
        if (lead>length) {
            lead=length;
        }
        uint32_t chunk=0;
        memcpy((char*)&chunk+misalign,cursor,lead);
        total+=ntohl(chunk);
        cursor+=lead;
        length-=lead;
    }

    // Whole 32-bit blocks.
    for (size_t blocks=length/4;blocks!=0;--blocks) {
        uint32_t chunk;
        memcpy(&chunk,cursor,sizeof(chunk));
        total+=ntohl(chunk);
        cursor+=4;
    }

    // One to three trailing bytes, padded with zeros at the end of the word.
    size_t tail=length%4;
    if (tail!=0) {
        uint32_t chunk=0;
        memcpy(&chunk,cursor,tail);
        total+=ntohl(chunk);
    }

    // Fold the deferred carries: first 64 -> 32 bits, then down to 16 bits.
    total=(total>>32)+(total&0xffffffff);
    while ((total>>16)!=0) {
        total=(total>>16)+(total&0xffff);
    }

    // An odd start address means every byte was summed in the opposite half
    // of its 16-bit word; swapping the two halves of the result compensates.
    if ((misalign&1)!=0) {
        total=((total&0x00ff)<<8)|((total&0xff00)>>8);
    }

    // Complement the low 16 bits and convert back to network byte order.
    return htons((uint16_t)~total);
}

The maximum length of message that can be processed by this function is limited to approximately
16 gigabytes by the number of deferred carries that can be accumulated. In the unlikely event that
this is insufficient then the upper half of the accumulator can be folded into the lower half as often
as is necessary to prevent an overflow. This is more likely to be required when processing 16-bit
blocks using a 32-bit accumulator, in which case only 128 kilobytes can be processed without the
risk of overflow.

Testing
Here is an example of how an 8-byte ICMP echo request might be constructed using
the icmphdr structure type provided by glibc:
// Build an 8-byte ICMP echo request and fill in its checksum.
struct icmphdr req;
req.type=8;
req.code=0;
// The checksum field must be zero while the checksum is being computed.
req.checksum=0;
// Multi-byte fields are stored in network byte order.
req.un.echo.id=htons(0x1234);
req.un.echo.sequence=htons(1);
// ip_checksum returns network byte order, so the result is stored as-is.
req.checksum=ip_checksum(&req,8);

The resulting message, as a hexadecimal byte stream, should be as follows:


08 00 E5 CA 12 34 00 01

Variations
Verifying a checksum
There are two ways in which checksums of the type described here can be verified:
by calculating what the checksum should be using the normal method,
then comparing this to the value received, or
by calculating the checksum without first zeroing the checksum field,
then comparing this with normal zero (0x0000).
The second method is likely to be simpler, quicker and more convenient in most cases. If you
should decide to use the first method then some care is needed with regard to negative and normal
zero. RFC 1624 recommends that either be accepted (in accordance with the robustness principle:
be conservative in what you send, liberal in what you accept). This can be achieved by normalising
the received checksum before performing the comparison.
(No special action is required when using the second method, provided that the checksum algorithm
used to perform the verification consistently returns normal zero in preference to negative zero. A
minor optimisation would be to omit the final inversion and compare the accumulator with negative
zero.)

Avoiding the use of memcpy


If the data were presented to the checksum function as an array of uint16_t then the calls
to memcpy could be omitted. There are two ways to achieve this. The safer method is to assemble the
message within a union:
/* Overlay the ICMP header on an array of 16-bit words so the same storage
   can be read as uint16_t values without a pointer cast. */
union {
uint16_t words[740];
struct icmphdr icmp;
} message;

This is allowed by C99, but not by C89 or C++. It has the disadvantage that the union must be
constructed by the caller if copying is to be avoided, and this may not always be practicable.
The alternative is to reinterpret the data by means of a type cast. This would not normally be safe in
any variant of C or C++, and would be quite likely to fall foul of the aliasing rules that are specified
by C99. However in some compilation environments it can be made safe (or at least, less unsafe)
by disabling strict application of the aliasing rules. In the case of GCC this is done using the -fno-strict-aliasing option or the may_alias attribute.
It should be noted that the removal of memcpy will not necessarily improve the performance of the
checksum function because the compiler may already be able to achieve the same result without
assistance. For example, GCC can do this in some cases when optimisation is enabled. It would be
advisable to determine whether there is any benefit to be gained before making non-portable
changes to the source code.

Omitting the conversion between network and host byte order


The checksum algorithm described here has the property that it works equally well when the upper
and lower halves of each 16-bit block are reversed. For example, applying it to the sequence:
0x4500, 0x001c, 0x03de, 0x0000, 0x4001, 0x0000, 0x7f00, 0x0001, 0x7f00, 0x0001

gives a checksum of 0x7901, whereas applying it to:


0x0045, 0x1c00, 0xde03, 0x0000, 0x0140, 0x0000, 0x007f, 0x0100, 0x007f, 0x0100

gives 0x0179. This is due to the carry from the most significant byte of each block being fed back into
the least significant byte and vice versa. It might therefore appear that the calls
to ntohs and htons made above are redundant. This is almost, but not quite, correct.
The usual behaviour of ntohs is to either do nothing or reverse the byte order. In either of these
cases the calls to ntohs and htons cancel out and could be removed. However POSIX states quite
clearly that an arbitrary rearrangement of the bit pattern could occur, so if you want to be certain

that the algorithm will behave as intended then an explicit conversion to host byte order is
necessary.

Further reading
J. Postel, Internet Protocol - DARPA Internet Program Protocol
Specification, STD 5, RFC 791, DARPA, September 1981
R. Braden, D. Borman and C. Partridge, Computing the Internet
Checksum, RFC 1071, September 1988
T Mallory and A. Kullberg, Incremental Updating of the Internet
Checksum, RFC 1141, January 1990
A. Rijsinghani, Computation of the Internet Checksum via Incremental
Update, RFC 1624, May 1994

Send a UDP datagram in C


Content

1 Objective

2 Scenario

3 Method

3.1 Overview

3.2 Construct the remote socket address

3.3 Create the client socket.

3.4 Send the datagram

3.5 Send the datagram (using sendto)

3.6 Send the datagram (using sendmsg)

4 Variations

4.1 Sending to the IPv4 broadcast address

4.2 Replying to a datagram

4.3 Connecting to a remote host

5 See also

6 Further Reading

Tested on
Debian (Lenny)

Objective
To send an outbound UDP datagram in C

Scenario
Suppose that you wish to write a client that implements the UDP-based variant of the Daytime
Protocol, as defined by RFC 867.
This is a very simple protocol whereby the client sends a datagram to the server, then the server
responds with a datagram containing a human-readable copy of the current date and time. The
datagram from the client is not required to have any particular content.

Method
Overview
The method described here has three steps:
1. Construct the remote socket address.
2. Create a UDP socket.
3. Send the datagram.
The following header files will be needed:
#include <errno.h>
#include <string.h>
#include <unistd.h>
#include <netdb.h>
#include <sys/socket.h>
#include <netinet/in.h>

and if using sendmsg to send the datagram:


#include <sys/uio.h>

Construct the remote socket address


To send a UDP datagram it is necessary to specify the remote IP address and port number to which
the connection should be directed. The combination of these two values is treated as a single entity
called the socket address, which is represented by a struct sockaddr_in for IPv4 or a struct
sockaddr_in6 for IPv6.
A local socket address may also be specified, however it is rarely necessary to do so. By default the
local address is chosen automatically by the network stack.
Most common network services have an assigned port number on which they are normally
expected to listen. It makes sense for the client to use this as a default, however it is important that
an alternative can be selected. The user of the client will not necessarily have any control over how
the server is configured, so the onus is on the client software to provide access to whichever port
the server has been instructed to use.
It is often useful for the remote IP address to default to the loopback address, particularly for
services such as databases where there is a good chance of the client and server being run on the
same machine. Alternatively, it may be preferable to require that the destination be specified
explicitly.
For most purposes the best way to construct the remote address is by calling getaddrinfo. This
takes a string containing either a hostname or an IP address, and a second string containing either a
service name or a port number. These are converted into a sockaddr_in or a sockaddr_in6 as
appropriate:
/* Resolve the remote socket address for the daytime service. */
const char* hostname=0; /* localhost */
const char* portname="daytime";
struct addrinfo hints;
/* Zero every field first so that anything not set below is left at 0. */
memset(&hints,0,sizeof(hints));
hints.ai_family=AF_UNSPEC; /* either IPv4 or IPv6 */
hints.ai_socktype=SOCK_DGRAM; /* datagram sockets only */
hints.ai_protocol=0; /* protocol left unspecified */
hints.ai_flags=AI_ADDRCONFIG;
/* On success res points to the head of a linked list of results. */
struct addrinfo* res=0;
int err=getaddrinfo(hostname,portname,&hints,&res);
if (err!=0) {
/* getaddrinfo reports failure through its return value. */
die("failed to resolve remote socket address (err=%d)",err);
}

The hints argument contains additional information to help guide the conversion. In this example:
The address family has been left unspecified so that both IPv4 and IPv6
addresses can be returned. In principle you could receive results for other
address families too: you can either treat this as a feature, or filter out
any unwanted results after the call to getaddrinfo.
The socket type has been constrained to SOCK_DGRAM. This allows UDP
but excludes TCP.

The protocol has been left unspecified because it is only meaningful in


the context of a specific address family. If the address family had been set
to AF_INET or AF_INET6 then this field could have been set to
IPPROTO_UDP (but it is equally acceptable to leave it set to zero).
The AI_PASSIVE flag has not been set because the result is intended for
use as a remote address, not as a local address. This causes the IP
address to default to the loopback address (as opposed to the wildcard
address).
The AI_ADDRCONFIG flag has been set so that IPv6 results will only be
returned if the server has an IPv6 address, and similarly for IPv4.
The res argument is used to return a linked list of addrinfo structures containing the address or
addresses that were found. If multiple records are returned then the recommended behaviour
(from RFC 1123) is to try each address in turn, stopping when a successful outcome is achieved.
This assumes that you have some way to distinguish success from failure, which may not always be
the case, but if you are able to do this then you should. If not then an acceptable alternative is to use
the first result and discard the remainder.
The memory occupied by the result list should be released by calling freeaddrinfo once it is no
longer needed, however this cannot be done until after the datagram has been sent.

Create the client socket.


The socket that will be used to send the datagram should be created using the socket function. This
takes three arguments:

1. the domain (AF_INET or AF_INET6 in this case, corresponding to IPv4 or


IPv6 respectively),
2. the socket type (SOCK_DGRAM in this case, meaning that the socket
should provide connectionless and potentially unreliable transfer of
datagrams), and
3. the protocol (IPPROTO_UDP in this case, corresponding to UDP).
A value of 0 for the protocol requests the default for the given address family and socket type,
which for AF_INET or AF_INET6 and SOCK_DGRAM would be IPPROTO_UDP. It is equally acceptable for the
protocol to be deduced in this manner or specified explicitly.
Assuming you previously used getaddrinfo to construct the remote address then the required
values can be obtained from the addrinfo structure:

/* Create the socket using the family, type and protocol of the chosen
   getaddrinfo result. */
int fd=socket(res->ai_family,res->ai_socktype,res->ai_protocol);
if (fd==-1) {
/* socket failed: report the errno description. */
die("%s",strerror(errno));
}

Send the datagram


Datagrams can be sent using any function that is capable of writing to a file descriptor, however
unless you have connected the socket to a particular remote address (as described below) it is
necessary to use either sendto or sendmsg so that a destination address can be specified. Of
these sendmsg is the more flexible option, but at the cost of a significantly more complex interface.
Details for each function are given below.
Regardless of which function you choose, each function call will result in a separate datagram
being sent. For this reason you must either compose each datagram payload as a single, contiguous
block of memory, or make use of the scatter/gather capability provided by sendmsg.

Send the datagram (using sendto)


To call sendto you must supply the content of the datagram and the remote address to which it
should be sent:
/* Send content as one datagram to the resolved remote address; no flags.
   NOTE(review): sizeof(content) is the payload size only if content is an
   array, not a pointer - confirm at the point of use. */
if (sendto(fd,content,sizeof(content),0,
res->ai_addr,res->ai_addrlen)==-1) {
/* sendto failed: report the errno description. */
die("%s",strerror(errno));
}

The fourth argument is for specifying flags which modify the behaviour of sendto, none of which
are needed in this example.
The value returned by sendto is the number of bytes sent, or -1 if there was an error. UDP
datagrams are sent atomically, so unlike when writing to a TCP socket there is no need to wrap the
function call in a loop to handle partially-sent data.

Send the datagram (using sendmsg)


To call sendmsg, in addition to the datagram content and remote address you must also construct
an iovec array and a msghdr structure:
/* Describe the payload as a single buffer. */
struct iovec iov[1];
iov[0].iov_base=content;
iov[0].iov_len=sizeof(content);
/* Message header: destination address, payload vector, no ancillary data.
   msg_flags is not assigned here. */
struct msghdr message;
message.msg_name=res->ai_addr;
message.msg_namelen=res->ai_addrlen;
message.msg_iov=iov;
message.msg_iovlen=1;
message.msg_control=0;
message.msg_controllen=0;

/* Flags for the call itself go in the third argument (none here). */
if (sendmsg(fd,&message,0)==-1) {
die("%s",strerror(errno));
}

The purpose of the iovec array is to provide a scatter/gather capability so that the datagram payload
need not be stored in a contiguous region of memory. In this example the entire payload is stored in
a single buffer, therefore only one array element is needed.
The msghdr structure exists to bring the number of arguments to recvmsg and sendmsg down to a
manageable number. On entry to sendmsg it specifies where the destination address, the datagram
payload and any ancillary data are stored. In this example no ancillary data has been provided.
If you wish to pass any flags into sendmsg then this cannot be done using msg_flags, which is
ignored on entry. Instead you must pass them using the third argument to sendmsg (which is zero in
this example).

Variations
Sending to the IPv4 broadcast address
By default, attempts to send a datagram to the broadcast address are rejected with an error
(typically EACCES, however it is not obvious from the POSIX specification which error should
occur). This is a safety measure intended to reduce the risk of making unintended broadcasts. It can
be overridden by setting the SO_BROADCAST socket option:
/* Enable SO_BROADCAST on fd; the option value is an int set to 1. */
int broadcast=1;
if (setsockopt(fd,SOL_SOCKET,SO_BROADCAST,
&broadcast,sizeof(broadcast))==-1) {
/* setsockopt failed: report the errno description. */
die("%s",strerror(errno));
}

Replying to a datagram
When replying to a UDP datagram the response should normally be sent to the IP address and port
number from which the request originated. This can be arranged by capturing the source address of
the request using recvfrom or recvmsg, then passing it to sendto or sendmsg as the destination
address for the response.
There is also the question of where the response should be sent from. In most cases the best choice
will be from the port and IP address to which the request was directed. This is not a requirement of
the User Datagram Protocol itself, however there are several reasons why it is desirable:
Generic firewalls and NAT gateways normally use both source and
destination port numbers and IP addresses for connection tracking (as
per RFC 2663) so will fail to associate the response with the request if it is
not sent from the appropriate port and IP address.

The behaviour of the connect function in relation to UDP strongly


encourages the assumption that any response will originate from a
matching IP address and port number. When a UDP socket is in the
connected state, datagrams from any other source are rejected.
RFC 1123 recommends (but does not require) that when replying to a
UDP datagram on a multihomed host, the response should be sent from
the IP address to which the request was directed.
Some application-layer protocols (such as DNS) explicitly require that
replies be sent from a matching port.
An exception would be where the application-layer protocol explicitly requires or allows the
response to originate from a different port (for example, as is the case for TFTP).
Replying from a matching port number can be achieved very easily by sending the response using
the socket that received the request. This method will reply from a matching IP address if the socket
is bound to a specific address, but not necessarily if it is bound to the wildcard address and the
server is multihomed.
Unfortunately the POSIX API does not provide a satisfactory way to reply from a matching IP
address in a portable manner. Briefly, the available options include:
using a non-portable mechanism such as IP_PKTINFO or the combination
of IP_RECVDSTADDR and IP_SENDSRCADDR to obtain and set the local IP address,
binding a separate socket to each local IP address, having non-portably
obtained a list of addresses using a mechanism such as SIOCGIFCONF, or
sending the response from the wildcard address in cases where use of a
matching address is non-mandatory, accepting that there are some use
cases in which this will fail.
This is a substantial topic in its own right and will be the subject of a future microHOWTO.

Connecting to a remote host


When exchanging many datagrams with a particular remote host it may be beneficial for a UDP
socket to be connected to that host. This removes the need for the remote address to be explicitly
checked every time a datagram is received, and for the address to be specified every time one is
sent. The connection is made using the connect function:
/* Connect the UDP socket to the remote address returned by getaddrinfo.
   The length passed is the one reported alongside the address (ai_addrlen);
   applying sizeof to a pointer to the address would give the size of the
   pointer, not of the socket address. */
if (connect(fd,res->ai_addr,res->ai_addrlen)==-1) {
/* connect failed: report the errno description. */
die("%s",strerror(errno));
}

This is superficially identical to the call that would be made to establish a TCP connection,
however unlike TCP there is no handshake. This has two notable consequences:
Calling connect on a UDP socket does not (by itself) result in any network
activity.
The call to connect will succeed even if the remote machine is
unreachable or nonexistent.
A UDP socket in the connected state will only receive datagrams that originate from the given
remote address. It is therefore feasible to use functions such as read or recv in place of recvfrom.
Similarly the given remote address becomes the default for outgoing datagrams, therefore it is
feasible to use write or send in place of sendto. (Being connected does not, however, prevent you
from sending datagrams to arbitrary destinations using sendto if you so wish.)

See also
Listen for and receive UDP datagrams in C
Establish a TCP connection in C
Send an arbitrary IPv4 datagram using a raw socket in C

Further Reading
W. Richard Stevens et al, Unix Network Programming, Volume 1: The
Sockets Networking API, 3rd edition, Addison-Wesley, 2003
The Open Group, sendto, Base Specifications Issue 6
The Open Group, sendmsg, Base Specifications Issue 6

Listen for and receive UDP datagrams in C


Content

1 Objective

2 Scenario

3 Method

3.1 Overview

3.2 Construct the local socket address

3.3 Create the socket.

3.4 Bind the local address to the socket

3.5 Receive and handle datagrams as they arrive

3.6 Receive and handle datagrams as they arrive using recvfrom

3.7 Receive and handle datagrams as they arrive using recvmsg

4 Variations

4.1 Listening for a reply

4.2 Connecting to a remote host

4.3 Determining the local address

5 See also

6 Further Reading

Tested on
Debian (Lenny)
Ubuntu (Lucid)

Objective
To listen for and receive inbound UDP datagrams in C

Scenario
Suppose that you wish to write a server that implements the UDP-based variant of the Daytime
Protocol, as defined by RFC 867.
This is a very simple protocol whereby the client sends a datagram to the server, then the server
responds with a datagram containing a human-readable copy of the current date and time. The
datagram from the client is not required to have any particular content.

Method
Overview
The method described here has four steps:
1. Construct the local socket address.
2. Create the socket.
3. Bind the local address to the socket.
4. Receive and handle datagrams as they arrive.
This is the appropriate procedure when listening for unsolicited datagrams, as in the scenario
described above. See below for how it can be adapted to:
listening for a reply to a datagram that you have sent, or
exchanging many datagrams with a particular remote host.
The following header files will be needed:
#include <errno.h>
#include <string.h>
#include <unistd.h>
#include <netdb.h>
#include <sys/socket.h>
#include <netinet/in.h>

and if using recvmsg to receive datagrams:


#include <sys/uio.h>

Construct the local socket address


In order to listen for UDP datagrams it is necessary to choose a port number and, optionally, a local
IP address on which to listen. The combination of these two values is treated as a single entity
called the socket address, which is represented by a struct sockaddr_in for IPv4 or a struct
sockaddr_in6 for IPv6.
Most common network services have an assigned port number on which they are normally
expected to listen. While it makes sense to use this as the default, it is good practice to make the
port number configurable. Possible reasons for wanting to override the assigned port number
include:
running multiple instances of a network service on the same machine,

running a network service that would normally use a well-known port


number from a non-root account, or
making port scanning more time-consuming than it would be if the
standard port number were used.
The local IP address should normally default to either the wildcard address or the loopback
address, but like the port number it is good practice to make it configurable. When a service is
bound to a particular IP address it will only accept connections directed to that address, whereas
when bound to the wildcard address it will accept connections to any local address. Binding to the
loopback address has the effect of prohibiting connections from other machines.
For most purposes the best way to construct the socket address is by calling getaddrinfo. This takes
a string containing the IP address and a string containing the port number, and converts them into
a sockaddr_in or a sockaddr_in6 as appropriate. It is also able to resolve hostnames and service
names:
/* Resolve the local socket address on which to listen for the daytime service. */
const char* hostname=0; /* wildcard */
const char* portname="daytime";
struct addrinfo hints;
/* Zero every field first so that anything not set below is left at 0. */
memset(&hints,0,sizeof(hints));
hints.ai_family=AF_UNSPEC; /* either IPv4 or IPv6 */
hints.ai_socktype=SOCK_DGRAM; /* datagram sockets only */
hints.ai_protocol=0; /* protocol left unspecified */
/* AI_PASSIVE: with a null hostname the result is the wildcard address. */
hints.ai_flags=AI_PASSIVE|AI_ADDRCONFIG;
/* On success res points to the head of a linked list of results. */
struct addrinfo* res=0;
int err=getaddrinfo(hostname,portname,&hints,&res);
if (err!=0) {
/* getaddrinfo reports failure through its return value. */
die("failed to resolve local socket address (err=%d)",err);
}

The hints argument contains additional information to help guide the conversion. In this example:
The address family has been left unspecified so that both IPv4 and IPv6
addresses can be returned. In principle you could receive results for other
address families too: you can either treat this as a feature, or filter out
any unwanted results after the call to getaddrinfo.
The socket type has been constrained to SOCK_DGRAM. This allows UDP but
excludes TCP.

The protocol has been left unspecified because it is only meaningful in


the context of a specific address family. If the address family had been set
to AF_INET or AF_INET6 then this field could have been set to IPPROTO_UDP (but it
is equally acceptable to leave it set to zero).
The AI_PASSIVE flag has been set because the address is intended for use
by a server. It causes the IP address to default to the wildcard address as
opposed to the loopback address.

The AI_ADDRCONFIG flag has been set so that IPv6 results will only be
returned if the server has an IPv6 address, and similarly for IPv4.
The res argument is used to return a linked list of addrinfo structures containing the address or
addresses that were found. If the network service daemon has the ability to listen on multiple
sockets then it should open one for each address in the list. Otherwise it is considered acceptable to
use the first result and discard the remainder.
The memory occupied by the result list should be released by calling freeaddrinfo once it is no
longer needed, however this cannot be done until after the socket has been created and bound.

Create the socket.


The socket that will be used to listen for inbound datagrams should be created using
the socket function. This takes three arguments:
1. the domain (AF_INET or AF_INET6 in this case, corresponding to IPv4 or
IPv6 respectively),
2. the socket type (SOCK_DGRAM in this case, meaning that the socket
should provide connectionless and potentially unreliable transfer of
datagrams), and
3. the protocol (IPPROTO_UDP in this case, corresponding to UDP).
A value of 0 for the protocol requests the default for the given address family and socket type,
which for AF_INET or AF_INET6 and SOCK_DGRAM would be IPPROTO_UDP. It is equally acceptable for the
protocol to be deduced in this manner or specified explicitly.
Assuming you previously used getaddrinfo to construct the local address then the required
values can be obtained from the addrinfo structure:
int fd=socket(res->ai_family,res->ai_socktype,res->ai_protocol);
if (fd==-1) {
die("%s",strerror(errno));
}

Bind the local address to the socket


As noted previously, the server socket must be bound to a local address before it can listen for
inbound datagrams. This should be done using the bind function:
if (bind(fd,res->ai_addr,res->ai_addrlen)==-1) {
die("%s",strerror(errno));
}

The first argument is the socket descriptor. The second and third arguments are the local address
and its length.
If the local address was constructed using getaddrinfo then the memory occupied by the address
list can now be released:
freeaddrinfo(res);

(If the address list has been searched or filtered then take care that it is the head of the list that is
released, not the address that you have chosen to use.)

Receive and handle datagrams as they arrive


Datagrams can be received using any function that is capable of reading from a file descriptor,
however if you are listening for unsolicited datagrams (as in this example) then you will normally
want to know where each datagram originated from so that it can be replied to. This information is
provided by the functions recvfrom and recvmsg. Of these recvmsg is the more flexible option, but at
the cost of a significantly more complex interface. Details for each function are given below.
Regardless of which function you choose you will need to supply a buffer to receive the data. If this
is too small to accommodate a complete datagram then any excess is discarded. That means you
need not be concerned about tracking datagram boundaries, because the first byte returned by a
read operation will always be the start of a datagram. However it does raise two issues: how the
buffer size should be chosen, and how any overflow can be detected.
UDP-based application-layer protocols often limit the size of datagram that can be sent in order to
provide a solution to the first issue. For example, TFTP and DNS each have a fixed maximum
payload size of 512 bytes. For DHCP the limit defaults to 548 bytes, but a larger value can be
negotiated if both parties are willing to support it.
In the absence of such guidance it is necessary to consider what the transport, network and link
layer protocols are likely to support. The maximum payload size for UDP over IPv4 is 65507 bytes,
and for IPv6 with jumbogram support it is close to 4 gigabytes. However, the largest payload that
an implementation is required to support is 548 bytes for IPv4 and 1452 bytes for IPv6. On an
Ethernet with the standard MTU of 1500 bytes, the largest payload that can be sent without
fragmentation is 1472 bytes. On this basis, 1472 bytes would be a reasonable choice if you have no
reason to believe that a larger buffer is needed or that a smaller buffer would suffice.
It is possible to receive arbitrary-length datagrams with assistance from the MSG_PEEK option,
however if you choose to do this then it would be prudent to set an upper limit in order to prevent
denial of service attacks.

The recvmsg function explicitly reports truncation by setting the MSG_TRUNC flag in
the msg_flags member of the message header. Alternatively, truncation can be detected when using
any of the available functions by providing a buffer that is one byte longer than the largest payload
that you actually wish to receive, then interpreting a full buffer as a truncated datagram.

Receive and handle datagrams as they arrive using recvfrom


To call recvfrom you need a buffer for the datagram and a buffer for the remote address:
char buffer[549];
struct sockaddr_storage src_addr;
socklen_t src_addr_len=sizeof(src_addr);
ssize_t count=recvfrom(fd,buffer,sizeof(buffer),0,(struct sockaddr*)&src_addr,&src_addr_len);
if (count==-1) {
die("%s",strerror(errno));
} else if (count==sizeof(buffer)) {
warn("datagram too large for buffer: truncated");
} else {
handle_datagram(buffer,count);
}

The fourth argument is for specifying flags which modify the behaviour of recvfrom, none of which
are needed in this example.
The value returned by recvfrom is the number of bytes received, or -1 if there was an error.
Truncation is detected in this example using the technique described above of providing a slightly
over-sized datagram buffer.

Receive and handle datagrams as they arrive using recvmsg


To call recvmsg, in addition to buffers for the datagram and remote address you must also construct
an iovec array and a msghdr structure:
char buffer[548];
struct sockaddr_storage src_addr;
struct iovec iov[1];
iov[0].iov_base=buffer;
iov[0].iov_len=sizeof(buffer);
struct msghdr message;
message.msg_name=&src_addr;
message.msg_namelen=sizeof(src_addr);
message.msg_iov=iov;
message.msg_iovlen=1;
message.msg_control=0;
message.msg_controllen=0;
ssize_t count=recvmsg(fd,&message,0);
if (count==-1) {
die("%s",strerror(errno));
} else if (message.msg_flags&MSG_TRUNC) {
warn("datagram too large for buffer: truncated");
} else {
handle_datagram(buffer,count);
}

The purpose of the iovec array is to provide a scatter/gather capability so that the datagram payload
need not be stored in a contiguous region of memory. In this example the entire payload is stored in
a single buffer, therefore only one array element is needed.
The msghdr structure exists to bring the number of arguments to recvmsg and sendmsg down to a
manageable number. On entry to recvmsg it specifies where the source address, the datagram payload
and any ancillary data should be stored. In this example no ancillary data has been requested,
therefore no provision has been made for receiving any.
The msg_flags field of the msghdr structure is used by recvmsg to return flags to the caller. These
include the MSG_TRUNC flag, which on exit will be set if the datagram was truncated or clear if it was
not. If you wish to pass any flags into recvmsg then this cannot be done using msg_flags, which is
ignored on entry. Instead you must pass them using the third argument to recvmsg (which is zero in
this example).

Variations
Listening for a reply
When listening for a reply to a datagram that you have sent then three of the four steps listed above
may be omitted:
You can (and normally should) listen for the reply using the same socket
from which the request was sent.
The act of sending the request will have bound the socket to an unused
port number. This will have been used as the source of the request, so
should match the destination of the reply. The socket is therefore
correctly bound to receive the reply.

Connecting to a remote host


When exchanging many datagrams with a particular remote host it may be beneficial for a UDP
socket to be connected to that host. This removes the need for the remote address to be explicitly
checked every time a datagram is received, and for the address to be specified every time one is
sent. The connection is made using the connect function:
/* Associate the UDP socket with one remote peer; die() reports the failure reason. */
/* NOTE(review): sizeof(remote_addr) is only the address length if remote_addr is a
   struct/array object. It is passed uncast where a struct sockaddr* is expected, which
   suggests a pointer -- in that case sizeof yields the pointer size. Confirm against the
   declaration and pass the real address length (e.g. ai_addrlen) if so. */
if (connect(fd,remote_addr,sizeof(remote_addr))==-1) {
die("%s",strerror(errno));
}

This is superficially identical to the call that would be made to establish a TCP connection,
however unlike TCP there is no handshake. This has two notable consequences:

Calling connect on a UDP socket does not (by itself) result in any network
activity.
The call to connect will succeed even if the remote machine is unreachable
or nonexistent.
A UDP socket in the connected state will only receive datagrams that originate from the given
remote address. It is therefore feasible to use functions such as read or recv in place of recvfrom.
Similarly the given remote address becomes the default for outgoing datagrams, therefore it is
feasible to use write or send in place of sendto. (Being connected does not, however, prevent you
from sending datagrams to arbitrary destinations using sendto if you so wish.)

Determining the local address


When replying to a datagram on a multihomed host, RFC 1123 recommends that the source address
of the reply should match the destination address of the corresponding request. Unfortunately the
POSIX API does not provide a satisfactory way to achieve this in a portable manner. Briefly, the
available options include:
using a non-portable mechanism to obtain the address, such
as IP_RECVDSTADDR or IP_PKTINFO, if one is available,
binding a separate socket to each local IP address, having non-portably
obtained a list of addresses using a mechanism such as SIOCGIFCONF, or
sending the response from the wildcard address in cases where use of a
matching address is non-mandatory, accepting that there are some use
cases in which this will fail.
This is a substantial topic in its own right and will be the subject of a future microHOWTO.

See also
Send a UDP datagram in C
Listen for and accept TCP connections in C

Further Reading
W. Richard Stevens et al, Unix Network Programming, Volume 1: The
Sockets Networking API, 3rd edition, Addison-Wesley, 2003
The Open Group, recvfrom, Base Specifications Issue 6
The Open Group, recvmsg, Base Specifications Issue 6

Establish a TCP connection in C


Content

1 Objective

2 Scenario

3 Method

3.1 Overview

3.2 Construct the remote socket address

3.3 Create the client socket

3.4 Connect the socket to the remote address.

4 See also

5 Further Reading

Tested on
Debian (Lenny)
Ubuntu (Precise)

Objective
To establish an outbound TCP connection in C

Scenario
Suppose that you wish to write a client that implements the TCP-based variant of the Daytime
Protocol, as defined by RFC 867
This is a very simple protocol whereby the server sends a human-readable copy of the current date
and time then closes the connection. The client is not required to send any data, and anything it
does send is ignored.

Method
Overview
The method described here has three steps:

1. Construct the remote socket address.


2. Create the client socket.
3. Connect the socket to the remote address.
The following header files will be needed:
#include <errno.h>
#include <string.h>
#include <unistd.h>
#include <netdb.h>
#include <sys/socket.h>
#include <netinet/in.h>

Construct the remote socket address


To establish an outbound TCP connection it is necessary to specify the remote IP address and port
number to which the connection should be directed. The combination of these two values is treated
as a single entity called the socket address, which is represented by a struct sockaddr_in for IPv4
or a struct sockaddr_in6 for IPv6.
(A local socket address may also be specified, however it is rarely necessary to do so. By default
the local address is chosen automatically by the network stack.)
Most common network services have an assigned port number on which they are normally
expected to listen. It makes sense for the client to use this as the default, however it is important
that an alternative can be selected. The user of the client will not necessarily have any control over
how the server is configured, so the onus is on the client software to provide access to whichever
port the server has been instructed to use.
It is often useful for the remote IP address to default to the loopback address, particularly for
services such as databases where there is a good chance of the client and server being run on the
same machine. Alternatively, it is sometimes preferable to require that the destination be specified
explicitly.
For most purposes the best way to construct the remote address is by calling getaddrinfo. This
takes a string containing either a hostname or an IP address, and a second string containing either a
service name or a port number. These are converted into a sockaddr_in or a sockaddr_in6 as
appropriate:
const char* hostname=0; /* localhost */
const char* portname="daytime";
struct addrinfo hints;
memset(&hints,0,sizeof(hints));
hints.ai_family=AF_UNSPEC;
hints.ai_socktype=SOCK_STREAM;

hints.ai_protocol=0;
hints.ai_flags=AI_ADDRCONFIG;
struct addrinfo* res=0;
int err=getaddrinfo(hostname,portname,&hints,&res);
if (err!=0) {
die("failed to resolve remote socket address (err=%d)",err);
}

The hints argument contains additional information to help guide the conversion. In this example:
The address family has been left unspecified so that both IPv4 and IPv6
addresses can be returned. In principle you could receive results for other
address families too: you can either treat this as a feature, or filter out
any unwanted results after the call to getaddrinfo.
The socket type has been constrained to SOCK_STREAM. This allows TCP
but excludes UDP.
The protocol has been left unspecified because it is only meaningful in
the context of a specific address family. If the address family had been set
to AF_INET or AF_INET6 then this field could have been set to
IPPROTO_TCP (but it is equally acceptable to leave it set to zero).
The AI_PASSIVE flag has not been set because the result is intended for
use as a remote address. Its absence causes the IP address to default to
the loopback address (as opposed to the wildcard address).
The AI_ADDRCONFIG flag has been set so that IPv6 results will only be
returned if the server has an IPv6 address, and similarly for IPv4.
The res argument is used to return a linked list of addrinfo structures containing the address or
addresses that were found. If multiple records are returned then the recommended behaviour
(from RFC 1123) is to try each address in turn, stopping when a connection is successfully
established. When doing this you may wish to limit the number of addresses tried and/or allow
connection attempts to overlap, in order to prevent the cumulative timeout period from becoming
excessive.
The memory occupied by the result list should be released by calling freeaddrinfo once it is no
longer needed, however this cannot be done until after the socket has been connected.

Create the client socket


The socket that will be used to establish the connection should be created using the socket function.
This takes three arguments:
1. the domain (AF_INET or AF_INET6 in this case, corresponding to IPv4 or
IPv6 respectively),

2. the socket type (SOCK_STREAM in this case, meaning that the socket
should provide reliable transport of an unstructured byte stream), and
3. the protocol (IPPROTO_TCP in this case, corresponding to TCP).
A value of 0 for the protocol requests the default for the given address family and socket type,
which for AF_INET or AF_INET6 and SOCK_STREAM would be IPPROTO_TCP. It is equally acceptable for
the protocol to be deduced in this manner or specified explicitly.
Assuming you previously used getaddrinfo to construct the remote address then the required
values can be obtained from the addrinfo structure:
int fd=socket(res->ai_family,res->ai_socktype,res->ai_protocol);
if (fd==-1) {
die("%s",strerror(errno));
}

When iterating through a list of addresses returned by getaddrinfo it is potentially necessary to


create a separate socket for each, because the addresses will not necessarily be members of the
same address family or use the same protocol.

Connect the socket to the remote address.


A connection is established by calling the connect function:
if (connect(fd,res->ai_addr,res->ai_addrlen)==-1) {
die("%s",strerror(errno));
}

The first argument is the socket descriptor. The second and third arguments are the remote socket
address and its length.
By default the connect function blocks until the initial TCP handshake has been completed and the
socket is ready for use, or alternatively, until the connection attempt fails. Some types of connection
failure are reported very quickly, whereas others can only be detected by means of a timeout. In the
latter case connect may block for several minutes.
If the remote address was constructed using getaddrinfo then the memory occupied by the address
list can now be released:
freeaddrinfo(res);

(If the address list has been searched or filtered then take care that it is the head of the list that is
released, not the address that you have chosen to use.)

The socket descriptor is now ready for use. Here is an example of how it might be utilised to
implement a Daytime Protocol client:
/* Copy everything the server sends to standard output until it closes the connection. */
char buffer[256];
for (;;) {
ssize_t count=read(fd,buffer,sizeof(buffer));
if (count<0) {
/* An interrupted read (EINTR) is simply retried; any other error is fatal. */
if (errno!=EINTR) die("%s",strerror(errno));
} else if (count==0) {
/* A zero-length read means end of stream: the peer has closed the connection. */
break;
} else {
/* NOTE(review): the return value of write is not checked, so a short or failed
   write to stdout goes unnoticed. */
write(STDOUT_FILENO,buffer,count);
}
}
close(fd);

See also
Listen for and accept TCP connections in C
Send a UDP datagram in C
Send an arbitrary IPv4 datagram using a raw socket in C

Further Reading
Listen for and accept TCP connections in C, microHOWTO

Listen for and accept TCP connections in C


Content

1 Objective

2 Scenario

3 Method

3.1 Overview

3.2 Construct the local socket address

3.3 Create the server socket

3.4 Set the SO_REUSEADDR socket option

3.5 Bind the local address to the server socket

3.6 Listen for connections

3.7 Accept connections as they arrive

4 Variations

4.1 Determining the remote address

4.2 Constructing the local socket address without using getaddrinfo

5 See also

6 Further Reading

Tested on
Debian (Lenny)
Ubuntu (Trusty)

Objective
To listen for and accept inbound TCP connections in C

Scenario
Suppose that you wish to write a daemon that implements the TCP-based variant of the Daytime
Protocol, as defined by RFC 867
This is a very simple protocol whereby the server sends a human-readable copy of the current date
and time then closes the connection. Any data that the client might send is ignored.

Method
Overview
The method described here has six steps:
1. Construct the local socket address.
2. Create the server socket.
3. Set the SO_REUSEADDR socket option.

4. Bind the local address to the server socket.


5. Listen for inbound connections.
6. Accept connections as they arrive.

The following header files will be needed:


#include <errno.h>
#include <string.h>
#include <unistd.h>
#include <netdb.h>
#include <sys/socket.h>
#include <netinet/in.h>

Construct the local socket address


In order to listen for TCP connections it is necessary to choose a port number and, optionally, a
local IP address on which to listen. The combination of these two values is treated as a single entity
called the socket address, which is represented by a struct sockaddr_in for IPv4 or a struct
sockaddr_in6 for IPv6.
Most common network services have an assigned port number on which they are normally
expected to listen. While it makes sense to use this as the default, it is good practice to make the
port number configurable. Possible reasons for wanting to override the assigned port number
include:
running multiple instances of a network service on the same machine,
running a network service that would normally use a well-known port
number from a non-root account, or
making port scanning more time-consuming than it would be if the
standard port number were used.
The local IP address should normally default to either the wildcard address or the loopback
address, but like the port number it is good practice to make it configurable. When a service is
bound to a particular IP address it will only accept connections directed to that address, whereas
when bound to the wildcard address it will accept connections to any local address. Binding to the
loopback address has the effect of prohibiting connections from other machines.
For most purposes the best way to construct the socket address is by calling getaddrinfo. This takes
a string containing the IP address and a string containing the port number, and converts them into
a sockaddr_in or a sockaddr_in6 as appropriate. It is also able to resolve hostnames and service
names:
const char* hostname=0; /* wildcard */
const char* portname="daytime";
struct addrinfo hints;
memset(&hints,0,sizeof(hints));
hints.ai_family=AF_UNSPEC;
hints.ai_socktype=SOCK_STREAM;
hints.ai_protocol=0;

hints.ai_flags=AI_PASSIVE|AI_ADDRCONFIG;
struct addrinfo* res=0;
int err=getaddrinfo(hostname,portname,&hints,&res);
if (err!=0) {
die("failed to resolve local socket address (err=%d)",err);
}

The hints argument contains additional information to help guide the conversion. In this example:
The address family has been left unspecified so that both IPv4 and IPv6
addresses can be returned. In principle you could receive results for other
address families too: you can either treat this as a feature, or filter out
any unwanted results after the call to getaddrinfo.
The socket type has been constrained to SOCK_STREAM. This allows TCP but
excludes UDP.

The protocol has been left unspecified because it is only meaningful in


the context of a specific address family. If the address family had been set
to AF_INET or AF_INET6 then this field could have been set to IPPROTO_TCP (but it
is equally acceptable to leave it set to zero).
The AI_PASSIVE flag has been set because the address is intended for
binding to a server socket. It causes the IP address to default to the
wildcard address as opposed to the loopback address.
The AI_ADDRCONFIG flag has been set so that IPv6 results will only be
returned if the server has an IPv6 address, and similarly for IPv4.
The res argument is used to return a linked list of addrinfo structures containing the address or
addresses that were found. If the network service daemon has the ability to listen on multiple
sockets then it should open one for each address in the list. Otherwise it is considered acceptable to
use the first result and discard the remainder.
The memory occupied by the result list should be released by calling freeaddrinfo once it is no
longer needed, however this cannot be done until after the socket has been created and bound.

Create the server socket


The socket that will be used to listen for connections should be created using the socket function.
This takes three arguments:
1. the domain (AF_INET or AF_INET6 in this case, corresponding to IPv4 or IPv6
respectively),

2. the socket type (SOCK_STREAM in this case, meaning that the socket should
provide reliable transport of an unstructured byte stream), and

3. the protocol (IPPROTO_TCP in this case, corresponding to TCP).

A value of 0 for the protocol requests the default for the given address family and socket type,
which for AF_INET or AF_INET6 and SOCK_STREAM would be IPPROTO_TCP. It is equally acceptable for
the protocol to be deduced in this manner or specified explicitly.
Assuming you previously used getaddrinfo to construct the local address then the required values
can be obtained from the addrinfo structure:
int server_fd=socket(res->ai_family,res->ai_socktype,res->ai_protocol);
if (server_fd==-1) {
die("%s",strerror(errno));
}

Set the SO_REUSEADDR socket option


SO_REUSEADDR should be routinely set for TCP server sockets in order to allow the network service to
be restarted when there are connections in the ESTABLISHED or TIME-WAIT state:

int reuseaddr=1;
if (setsockopt(server_fd,SOL_SOCKET,SO_REUSEADDR,&reuseaddr,sizeof(reuseaddr))==-1) {
die("%s",strerror(errno));
}

See Listen on a TCP port with connections in the TIME-WAIT state for a detailed discussion of
this issue.

Bind the local address to the server socket


As noted previously, the server socket must be bound to a local address before it can listen for
connections. This should be done using the bind function:
if (bind(server_fd,res->ai_addr,res->ai_addrlen)==-1) {
die("%s",strerror(errno));
}

The first argument is the socket descriptor. The second and third arguments are the local address
and its length.
If the local address was constructed using getaddrinfo then the memory occupied by the address
list can now be released:
freeaddrinfo(res);

(If the address list has been searched or filtered then take care that it is the head of the list that is
released, not the address that you have chosen to use.)

Listen for connections


The server socket can now be instructed to listen for connections. This should be done using
the listen function:
if (listen(server_fd,SOMAXCONN)) {
die("failed to listen for connections (errno=%d)",errno);
}

The first argument is the socket descriptor. The second argument is the backlog of outstanding
connections that the operating system should queue while they are waiting to be accepted by the
server process. It is only a hint: most implementations take some account of the value requested,
but you should not make any assumptions. A value of SOMAXCONN indicates that the maximum
permissible queue length should be selected.
The optimum value for the backlog depends on the nature of the load:
If the value is too low then the server will be poor at handling short-term
bursts of activity. Connections may be rejected even if the average load is
well below what the server can handle.
If the value is too high then the server will perform less well when it is
genuinely overloaded. Under those circumstances, lengthening the queue
merely increases latency without improving capacity.
A backlog of 5 is a popular choice due to its use in many tutorials. For services that receive
connections at a very slow rate this is probably adequate, but it is too low for services that handle
many short-lived connections (such as web servers). In that case the author's advice would be to
make the value configurable, with a default of SOMAXCONN.

Accept connections as they arrive


Connections are accepted by the server process by repeatedly calling the accept function. Each
time this is done a new socket descriptor is returned to act as an endpoint for the newly established
connection. If no connections are available then the function blocks.
The process of handling a connection should preferably not interfere with the acceptance or
handling of other connections. One way to ensure this is to spawn a new child process for each
connection:
/* Accept connections forever, handing each one to a freshly forked child process. */
for (;;) {
/* The peer address is not requested here (both address arguments are 0). */
int session_fd=accept(server_fd,0,0);
if (session_fd==-1) {
/* An accept interrupted by a signal is retried; any other failure is fatal. */
if (errno==EINTR) continue;
die("failed to accept connection (errno=%d)",errno);
}
pid_t pid=fork();

if (pid==-1) {
die("failed to create child process (errno=%d)",errno);
} else if (pid==0) {
/* Child: it does not need the listening socket, so close it, serve this one
   session, then exit with _exit. */
close(server_fd);
handle_session(session_fd);
close(session_fd);
_exit(0);
} else {
/* Parent: close its copy of the session descriptor so descriptors do not
   accumulate and the connection is not held open after the child closes it. */
close(session_fd);
}
}

The parent process should close the descriptor for each connected socket once the corresponding
child process has been spawned. There are two reasons for doing this: to prevent the descriptors
from accumulating, and to prevent the connection from being held open by the parent after it has
been closed by the child. Similarly, the child process should close any file or socket descriptors
inherited from the parent that it does not need access to. This will certainly include the descriptor
for the server socket, but you should consider whether there are any others.
Functionality that is specific to the network service is represented here by the
function handle_session. As a simple example, here is an implementation of the Daytime Protocol:
/*
 * Serve one Daytime Protocol session: format the current local date and time
 * as a single CRLF-terminated line and write all of it to session_fd.
 *
 * session_fd: descriptor of the connected socket. It is not closed here.
 *
 * If strftime cannot format the timestamp (it returns 0), a short error line
 * is sent instead. Calls die() if write fails for any reason other than EINTR.
 */
void handle_session(int session_fd) {
    time_t now=time(0);
    char buffer[80];
    size_t length=strftime(buffer,sizeof(buffer),"%a %b %d %T %Y\r\n",localtime(&now));
    if (length==0) {
        /* Replace the buffer with an error line and, crucially, take its length:
           otherwise length stays 0 and the loop below would send nothing. */
        int written=snprintf(buffer,sizeof(buffer),"Error: buffer overflow\r\n");
        if (written<0) {
            length=0;
        } else if ((size_t)written>=sizeof(buffer)) {
            /* snprintf reports the untruncated length; only what fits was stored. */
            length=sizeof(buffer)-1;
        } else {
            length=(size_t)written;
        }
    }
    /* write may accept fewer bytes than requested, so continue from where it stopped. */
    size_t index=0;
    while (index<length) {
        ssize_t count=write(session_fd,buffer+index,length-index);
        if (count<0) {
            /* An interrupted write is retried; any other error is fatal. */
            if (errno==EINTR) continue;
            die("failed to write to socket (errno=%d)",errno);
        } else {
            index+=(size_t)count;
        }
    }
}

Variations
Determining the remote address
It is often desirable and sometimes necessary to determine the remote address from which an
inbound connection originated. A common reason for wanting to do this is to keep a log of all
connections. Other possible motivations include access control, or establishing an outbound
connection back to the client.
The address can be obtained at the time when the connection is accepted by supplying a buffer to
place it in. Alternatively, it can be obtained at any time while the connection is open by
calling getpeername.

The supplied buffer must be large enough and sufficiently well-aligned to accept any socket address
that might be returned. If the address family has not been hard-coded then you can use the
type struct sockaddr_storage, which is designed to hold addresses of any type:
struct sockaddr_storage sa;
socklen_t sa_len=sizeof(sa);
int session_fd=accept(server_fd,(struct sockaddr*)&sa,&sa_len);

Alternatively, if the local address was constructed using getaddrinfo then the required size in bytes
can be found in the ai_addrlen member of the relevant addrinfo structure.
If there is a need to convert the address to human-readable form then this is best done using
the getnameinfo function, especially if it is not known whether the address family is IPv4 or IPv6:
char buffer[INET6_ADDRSTRLEN];
int err=getnameinfo((struct sockaddr*)&sa,sa_len,buffer,sizeof(buffer),0,0,NI_NUMERICHOST);
if (err!=0) {
snprintf(buffer,sizeof(buffer),"invalid address");
}

A useful refinement is to convert IPv4-mapped addresses into plain IPv4 addresses prior to
calling getnameinfo:
/* If sa holds an IPv4-mapped IPv6 address, rewrite it in place as a plain IPv4
   socket address and shrink sa_len to match. */
if (sa.ss_family==AF_INET6) {
struct sockaddr_in6* sa6=(struct sockaddr_in6*)&sa;
if (IN6_IS_ADDR_V4MAPPED(&sa6->sin6_addr)) {
struct sockaddr_in sa4;
memset(&sa4,0,sizeof(sa4));
sa4.sin_family=AF_INET;
/* The port is copied as-is, with no byte-order conversion. */
sa4.sin_port=sa6->sin6_port;
/* The IPv4 address is taken from the last 4 of the 16 IPv6 address bytes. */
memcpy(&sa4.sin_addr.s_addr,sa6->sin6_addr.s6_addr+12,4);
/* sa4 was built as a separate object, so sa6 (which points into sa) stays
   readable until this final copy overwrites sa. */
memcpy(&sa,&sa4,sizeof(sa4));
sa_len=sizeof(sa4);
}
}

For example, if an IPv4 connection from 192.168.0.1 were received using an IPv6 socket then the
code fragment above would cause the address to be presented as 192.168.0.1 instead of the less
readable ::ffff:192.168.0.1.

Constructing the local socket address without using getaddrinfo


There are some circumstances where getaddrinfo is not the best way to construct the local socket
address. For example, you may already have the port number and IP address in numeric form, or
you may need to be compatible with older systems on which getaddrinfo is not available. A
solution in these cases is to construct the socket address explicitly.
An IPv4 socket address is represented by a struct sockaddr_in. It should be zeroed before use, and
any information within it should be stored in network byte order. For example, to create a socket
address with a port number of 13 and the wildcard IP address:

struct sockaddr_in addr;


memset(&addr,0,sizeof(addr));
addr.sin_family=AF_INET;
addr.sin_port=htons(13);
addr.sin_addr.s_addr=htonl(INADDR_ANY);

Similarly for IPv6:


struct sockaddr_in6 addr;
memset(&addr,0,sizeof(addr));
addr.sin6_family=AF_INET6;
addr.sin6_flowinfo=0;
addr.sin6_port=htons(13);
addr.sin6_addr=in6addr_any;

See also
Listen on a TCP port with connections in the TIME-WAIT state
Establish a TCP connection in C
Listen for and receive UDP datagrams in C

Further Reading
Listen on a TCP port with connections in the TIME-WAIT state
Convert an IP address to a human-readable string in C

Listen on a TCP port with connections in the TIME-WAIT state


Content

1 Objective

2 Background

3 Scenario

4 Method

5 Notes

6 Methods to avoid

6.1 Using SO_LINGER

Tested on
Debian (Lenny, Precise)

Objective
To begin listening on a TCP port whilst there are one or more connections to that port in the TIME-WAIT state, without waiting for the TIME-WAIT state to expire.

Background
When a TCP connection is closed then the socket from which the closure was initiated is not
destroyed immediately. Instead it is placed in the TIME-WAIT state, where it is required to remain
for at least twice the maximum segment lifetime (MSL) to allow any stray network packets to
dissipate. During this period it is not permissible for another TCP connection to be established
between the same pair of IP addresses and port numbers.
By itself this would be no great burden, but most implementations go further and (by default) do
not allow a local address to be bound to a socket if there are any existing sockets using the same IP
address and port number (including sockets in the TIME-WAIT state).
The practical effect of this behaviour is that when a network service terminates leaving connections
in the TIME-WAIT state, it may not be possible to restart that service until the TIME-WAIT states
have expired. The error reported when this happens is EADDRINUSE, which glibc renders as Address
already in use.
Note that TIME-WAIT is not the only issue that could result in an EADDRINUSE error. For example,
there could be orphaned child processes that were spawned by the network service but are still
handling connections. Alternatively there could be another process listening to the port, perhaps
because the previously running instance of the network service failed to die. You can check for
these conditions by running the netstat command, without the -l option for connected sockets:
netstat -tn

and with the -l option for listening sockets:


netstat -tln

The maximum segment lifetime is implementation-dependent, but is typically in the range 30


seconds to 2 minutes. The minimum lifetime of the TIME-WAIT state is therefore typically in the
range 1 to 4 minutes.

Scenario
Suppose you are writing a daemon that provides a TCP-based network service. Currently the
following sequence of operations is used to open a server socket and listen on the required port:
int fd=socket(AF_INET,SOCK_STREAM,0);
if (fd==-1) {
die("%s",strerror(errno));
}
if (bind(fd,(struct sockaddr*)&addr,sizeof(addr))==-1) {
die("%s",strerror(errno));
}
if (listen(fd,SOMAXCONN)==-1) {
die("%s",strerror(errno));
}

When the network service is restarted it sometimes fails with the error Address already in use.
You wish to prevent this from happening.

Method
The error can be avoided by setting the SO_REUSEADDR socket option after the socket has been
created but before calling bind:
int fd=socket(AF_INET,SOCK_STREAM,0);
if (fd==-1) {
die("%s",strerror(errno));
}
int reuseaddr=1;
if (setsockopt(fd,SOL_SOCKET,SO_REUSEADDR,&reuseaddr,sizeof(reuseaddr))==-1) {
die("%s",strerror(errno));
}
if (bind(fd,(struct sockaddr*)&addr,sizeof(addr))==-1) {
die("%s",strerror(errno));
}
if (listen(fd,SOMAXCONN)==-1) {
die("%s",strerror(errno));
}

SO_REUSEADDR allows a local address to be bound to a socket even if that address is already being
used by a connection. This is helpful not only for dealing with connections in the TIME-WAIT
state, but also any ESTABLISHED connections that are being handled by orphaned child processes.

It is considered safe for a TCP server socket to reuse a local address, because such sockets are used
only to listen for connections and do not themselves act as endpoints. When new connections arrive
they will need to be checked to ensure that they do not clash with existing ones, but this is
something the network stack should be doing anyway: it makes no difference that the server
process has been restarted.
In the absence of any good reason for leaving SO_REUSEADDR unset, it is considered good practice to
set it as a matter of routine when creating TCP server sockets.

Notes
Depending on the implementation, it may be necessary for SO_REUSEADDR to be set both before and
after the service is restarted.
SO_REUSEADDR does not allow two TCP sockets to listen to the same IP address and port number at
the same time.

Methods to avoid
Using SO_LINGER
It is possible to prevent the TIME-WAIT state from being entered in the first place by setting
the SO_LINGER option with a timeout of zero. This changes the behaviour of the close function:
instead of performing a graceful shutdown, it aborts the connection by sending an immediate RST.
Any unsent data is discarded and the socket immediately reverts to the CLOSED state.
Whilst this would meet the objective as stated, it is not a desirable solution because it circumvents
the protection against stray network packets provided by the TIME-WAIT state.
Since SO_REUSEADDR achieves the desired effect more safely, there is no justification for
using SO_LINGER to avoid EADDRINUSE errors.

Convert an IP address to a human-readable string in C


Content

1 Objective

2 Scenario

3 Method

4 Variations

4.1 Converting IPv4-mapped IPv6 addresses to plain IPv4


5 Alternatives

5.1 Using inet_ntop

5.2 Using inet_ntoa

Tested on
Debian (Lenny)
Ubuntu (Precise, Trusty)

Objective
To convert an IPv4 or IPv6 address to a human-readable string (for
example 192.168.0.1 or 2001:db8::1)

Scenario
Suppose you have used the getpeername function to obtain the remote address to which a particular
TCP socket is connected:
struct sockaddr_storage addr;
socklen_t addr_len=sizeof(addr);
int err=getpeername(sock_fd,(struct sockaddr*)&addr,&addr_len);
if (err!=0) {
die("failed to fetch remote address (errno=%d)",errno);
}

The remote address has been written to a buffer called addr. This buffer is of type struct
sockaddr_storage, but the address stored within it will be of type struct
sockaddr_in or sockaddr_in6. The length of the address has been recorded in the variable addr_len.
Note that:

- addr is a socket address, so in addition to the IP address it contains
  information such as the address family and port number.
- addr_len will probably not be equal to sizeof(struct sockaddr_storage) once the
  call to getpeername has completed.

You wish to convert the IP address contained within addr to a human-readable string.

Method
One way to perform the required conversion is to call the getnameinfo function. By default this
attempts to convert the address into a domain name, however it can be instructed to produce a
numeric address instead by setting the NI_NUMERICHOST flag:
#include <netdb.h>
#include <sys/socket.h>
#include <netinet/in.h>
// ...
char buffer[INET6_ADDRSTRLEN];
int err=getnameinfo((struct sockaddr*)&addr,addr_len,buffer,sizeof(buffer),
0,0,NI_NUMERICHOST);
if (err!=0) {
die("failed to convert address to string (code=%d)",err);
}
printf("Remote address: %s\n",buffer);

The string buffer needs to be at least INET_ADDRSTRLEN bytes long for IPv4 and INET6_ADDRSTRLEN for
IPv6. Since these constants are fixed (by POSIX) at 16 and 46 bytes
respectively, INET6_ADDRSTRLEN can be presumed to suffice for either address family.

Variations
Converting IPv4-mapped IPv6 addresses to plain IPv4
If an IPv4 connection is made to an IPv6 socket then the local and remote network addresses will
be represented as IPv4-mapped addresses. For example, the IPv4 address 192.168.0.1 would be
represented by the IPv6 address ::ffff:192.168.0.1.
This format is readable, but it is probably not the best choice for presentation to the user. Since the
connection was made using IPv4, the user could reasonably expect to see an IPv4 address. This can
be achieved by converting the address from IPv6 to IPv4 before calling getnameinfo:
if (addr.ss_family==AF_INET6) {
struct sockaddr_in6* addr6=(struct sockaddr_in6*)&addr;
if (IN6_IS_ADDR_V4MAPPED(&addr6->sin6_addr)) {
struct sockaddr_in addr4;
memset(&addr4,0,sizeof(addr4));
addr4.sin_family=AF_INET;
addr4.sin_port=addr6->sin6_port;
memcpy(&addr4.sin_addr.s_addr,addr6->sin6_addr.s6_addr+12,sizeof(addr4.sin_addr.s_addr));
memcpy(&addr,&addr4,sizeof(addr4));
addr_len=sizeof(addr4);
}
}

The conversion is performed only if the address family is IPv6, and then only if the address is IPv4-mapped. The address buffer must be writable, and of the appropriate size and alignment to hold an
IPv4 or IPv6 socket address. (That is the case here because the buffer is of type struct
sockaddr_storage).

Alternatives
Using inet_ntop
An alternative method is to use the function inet_ntop. This is somewhat easier to use
than getnameinfo if the IP address is not already embedded within a socket address, for example:
#include <arpa/inet.h>
// ...
char buffer[INET_ADDRSTRLEN];
const char* result=inet_ntop(AF_INET,&ipv4addr,buffer,sizeof(buffer));
if (result==0) {
die("failed to convert address to string (errno=%d)",errno);
}

IPv6 addresses can be handled by specifying AF_INET6 as the first argument, but
(unlike getnameinfo) the result will not include the scope of a link-local or site-local address.

For both IPv4 and IPv6 the address passed in must be in network byte order (most significant byte
first).

Using inet_ntoa
Another alternative is to use the function inet_ntoa. As with inet_ntop, the given IP address need
not be embedded within a socket address:
#include <arpa/inet.h>
// ...
const char* result=inet_ntoa(ipv4addr);

Notable disadvantages of inet_ntoa are that it is not thread safe and provides no support for IPv6.
However it does pre-date both getnameinfo and inet_ntop, so is more likely to be available on older
systems.

Ifconfig: 10 Examples To Configure Network Interface


by RAMESH NATARAJAN on MARCH 9, 2009

This article is written by Lakshmanan G


Ifconfig command is used to configure network interfaces. ifconfig stands for interface configurator.
Ifconfig is widely used to initialize the network interface and to enable or disable the interfaces.
In this article, let us review 10 common usages of the ifconfig command.

1. View Network Settings of an Ethernet Adapter


Ifconfig, when invoked with no arguments will display all the details of currently active interfaces. If
you give the interface name as an argument, the details of that specific interface will be displayed.

# ifconfig eth0

eth0 Link encap:Ethernet HWaddr 00:2D:32:3E:39:3B

inet addr:192.168.2.2 Bcast:192.168.2.255 Mask:255.255.255.0

inet6 addr: fe80::21d:92ff:fede:499b/64 Scope:Link

UP BROADCAST RUNNING MULTICAST MTU:1500 Metric:1

RX packets:977839669 errors:0 dropped:1990 overruns:0 frame:0

TX packets:1116825094 errors:8 dropped:0 overruns:0 carrier:0

collisions:0 txqueuelen:1000

RX bytes:2694625909 (2.5 GiB) TX bytes:4106931617 (3.8 GiB)

Interrupt:185 Base address:0xdc00

2. Display Details of All interfaces Including Disabled Interfaces


# ifconfig -a

3. Disable an Interface
# ifconfig eth0 down

4. Enable an Interface
# ifconfig eth0 up

Or # ifup eth0

5. Assign ip-address to an Interface


Assign 192.168.2.2 as the IP address for the interface eth0.

# ifconfig eth0 192.168.2.2

Change Subnet mask of the interface eth0.

# ifconfig eth0 netmask 255.255.255.0

Change Broadcast address of the interface eth0.

# ifconfig eth0 broadcast 192.168.2.255

Assign ip-address, netmask and broadcast at the same time to interface eth0.

# ifconfig eth0 192.168.2.2 netmask 255.255.255.0 broadcast 192.168.2.255

6. Change MTU
This will change the Maximum transmission unit (MTU) to XX. MTU is the maximum number of
octets the interface is able to handle in one transaction. For Ethernet the Maximum transmission unit
by default is 1500.

# ifconfig eth0 mtu XX

7. Promiscuous mode
By default when a network card receives a packet, it checks whether the packet belongs to itself. If not,
the interface card normally drops the packet. But in promiscuous mode, the card doesn't drop the
packet. Instead, it will accept all the packets which flows through the network card.

Superuser privilege is required to set an interface in promiscuous mode. Most network monitor tools
use the promiscuous mode to capture the packets and to analyze the network traffic.

Following will put the interface in promiscuous mode.

# ifconfig eth0 promisc

Following will put the interface in normal mode.

# ifconfig eth0 -promisc

8 How to Add New Alias to Network Interface


The ifconfig utility allows you to configure additional network interfaces
using alias feature. To add alias network interface of eth0, use the following command.
Please note that the alias network address must be in the same subnet. For example, if
your eth0 network ip address is 172.16.25.125, then the alias ip address can be, for example,
172.16.25.127.

[root@tecmint ~]# ifconfig eth0:0 172.16.25.127

Next, verify the newly created alias network interface address, by using ifconfig eth0:0
command.

[root@tecmint ~]# ifconfig eth0:0

eth0:0 Link encap:Ethernet HWaddr 00:01:6C:99:14:68

inet addr:172.16.25.123 Bcast:172.16.25.63 Mask:255.255.255.240

UP BROADCAST RUNNING MULTICAST MTU:1500 Metric:1

Interrupt:17

9 How to Remove Alias to Network Interface


If you no longer require an alias network interface or you incorrectly configured it, you
can remove it by using the following command.

[root@tecmint ~]# ifconfig eth0:0 down

10 How to Change the MAC address of Network Interface


To change the MAC (Media Access Control) address of an eth0 network interface, use
the following command with argument hw ether. For example, see below.

[root@tecmint ~]# ifconfig eth0 hw ether AA:BB:CC:DD:EE:FF

These are the most useful commands for configuring network interfaces in Linux, for
more information and usage of ifconfig command use the manpages like man ifconfig
at the terminal. Check out some other networking utilities below.

Configure network card in promiscuous mode


When running in promiscuous mode, all traffic the network card receives can be read. This configuration is useful for us to do network
monitoring, like for a network intrusion detection system.
How can I config my network card in promiscuous mode?

You can do this easily by one command. It works on both RedHat and Debian based distributions. Below is an example:
root@db1:~# ifconfig eth1 promisc

[2685638.719679] device eth1 entered promiscuous mode

root@db1:~# ifconfig eth1 -promisc

root@db1:~# dmesg | tail -1

[2685655.668037] device eth1 left promiscuous mode


Then, how can we setup the promiscuous mode in configuration files, so that it takes effect when system boots? As the
configuration varies by distribution, here we raise two examples.

Setup promiscuous mode on Redhat / CentOS


To configure a network card in promiscuous mode, you need to put the line PROMISC=yes in its configuration
file /etc/sysconfig/network-scripts/ifcfg-ethX.
BOOTPROTO=static

DEVICE=ethX

ONBOOT=yes

TYPE=Ethernet

PROMISC=yes

USERCTL=no
Don't forget to replace ethX with the right device you are using.

Setup promiscuous mode on Ubuntu / Debian


Below is part of an example /etc/network/interfaces file:
auto eth0

iface eth0 inet manual

up ifconfig $IFACE 192.168.1.100 up

up ip link set $IFACE promisc on

down ip link set $IFACE promisc off

down ifconfig $IFACE down

TCPDUMP INFO
When it comes to tcpdump most admins fall into two categories; they either
know tcpdump and all of its flags like the back of their hand, or they kind of know it but
need to use a reference for anything outside of the basic usage. The reason for this is
because tcpdump is a pretty advanced command and it is pretty easy to get into the
depths of how networking works when using it.
For today's article I wanted to create a quick but practical reference for tcpdump. I will
cover the basics as well as some of the more advanced usage. I am sure I will most likely
leave out some cool commands so if you want to add anything please feel free to drop it
into the comments section.
Before we get too far into the weeds, it is probably best to cover what tcpdump is used
for. The command tcpdump is used to create "dumps" or "traces" of network traffic. It
allows you to look at what is happening on the network and really can be useful for
troubleshooting many types of issues including issues that aren't due to network
communications. Outside of network issues I use tcpdump to troubleshoot application
issues all the time; if you ever have two applications that don't seem to be working well
together, tcpdump is a great way to see what is happening. This is especially true if the
traffic is not encrypted as tcpdump can be used to capture and read packet data as well.

The Basics
The first thing to cover with tcpdump is what flags to use. In this section I am going to
cover the most basic flags that can be used in most situations.

Don't translate hostnames, ports, etc


# tcpdump -n

By default tcpdump will try to lookup and translate hostnames and ports.
# tcpdump
tcpdump: verbose output suppressed, use -v or -vv for full protocol decode
listening on eth0, link-type EN10MB (Ethernet), capture size 65535 bytes
16:15:05.051896 IP blog.ssh > 10.0.3.1.32855: Flags [P.], seq 2546456553:2546456749, ack 1824683693,
win 355, options [nop,nop,TS val 620879437 ecr 620879348], length 196

You can turn this off by using the -n flag. Personally, I always use this flag as the
hostname and port translation usually annoys me because I tend to work from IP
addresses rather than hostnames. However, knowing that you can
have tcpdump translate or not translate these is useful, as there are times where
knowing what server the source traffic is coming from is important.
# tcpdump -n
tcpdump: verbose output suppressed, use -v or -vv for full protocol decode
listening on eth0, link-type EN10MB (Ethernet), capture size 65535 bytes
16:23:47.934665 IP 10.0.3.246.22 > 10.0.3.1.32855: Flags [P.], seq 2546457621:2546457817, ack
1824684201, win 355, options [nop,nop,TS val 621010158 ecr 621010055], length 196

Adding verbosity
# tcpdump -v

By adding a simple -v the output will start including a bit more such as the ttl, total length
and options in the IP packets.
# tcpdump
tcpdump: verbose output suppressed, use -v or -vv for full protocol decode
listening on eth0, link-type EN10MB (Ethernet), capture size 65535 bytes
16:15:05.051896 IP blog.ssh > 10.0.3.1.32855: Flags [P.], seq 2546456553:2546456749, ack 1824683693,
win 355, options [nop,nop,TS val 620879437 ecr 620879348], length 196

tcpdump has three verbosity levels, you can add more verbosity by adding additional v's
to the command line flags. In general whenever I am using tcpdump I tend to use the

highest verbosity, as I like having everything visible just in case I need it.
# tcpdump -vvv -c 1
tcpdump: listening on eth0, link-type EN10MB (Ethernet), capture size 65535 bytes
16:36:13.873456 IP (tos 0x10, ttl 64, id 121, offset 0, flags [DF], proto TCP (6), length 184)
blog.ssh > 10.0.3.1.32855: Flags [P.], cksum 0x1ba1 (incorrect -> 0x0dfd), seq
2546458841:2546458973, ack 1824684869, win 355, options [nop,nop,TS val 621196643 ecr 621196379],
length 132

Specifying an Interface
# tcpdump -i eth0

By default when you run tcpdump without specifying an interface it will choose the
lowest numbered interface, usually this is eth0 however that is not guaranteed for all
systems.
# tcpdump
tcpdump: verbose output suppressed, use -v or -vv for full protocol decode
listening on eth0, link-type EN10MB (Ethernet), capture size 65535 bytes
16:15:05.051896 IP blog.ssh > 10.0.3.1.32855: Flags [P.], seq 2546456553:2546456749, ack 1824683693,
win 355, options [nop,nop,TS val 620879437 ecr 620879348], length 196

You can specify the interface by using the -i flag followed by the interface name. On most
linux systems a special interface name of any can be used to tell tcpdump to listen on
all interfaces, I find this extremely useful when troubleshooting servers with multiple
interfaces. This is especially true when there are routing issues involved.
# tcpdump -i any
tcpdump: verbose output suppressed, use -v or -vv for full protocol decode
listening on any, link-type LINUX_SLL (Linux cooked), capture size 65535 bytes
16:45:59.312046 IP blog.ssh > 10.0.3.1.32855: Flags [P.], seq 2547763641:2547763837, ack 1824693949,
win 355, options [nop,nop,TS val 621343002 ecr 621342962], length 196

Writing to a file
# tcpdump -w /path/to/file

When you just run tcpdump by itself it will output to your screen.
# tcpdump
tcpdump: verbose output suppressed, use -v or -vv for full protocol decode
listening on eth0, link-type EN10MB (Ethernet), capture size 65535 bytes
16:15:05.051896 IP blog.ssh > 10.0.3.1.32855: Flags [P.], seq 2546456553:2546456749, ack 1824683693,
win 355, options [nop,nop,TS val 620879437 ecr 620879348], length 196

There are many times where you may want to save the tcpdump data to a file, the easiest
way to do this is to use the -w flag. This is useful for situations where you may need to
save the network dump to review later. One benefit to saving the data to a file is that you
can read the dump file multiple times and apply other flags or filters (which we will cover
below) to that snapshot of network traffic.
# tcpdump -w /var/tmp/tcpdata.pcap
tcpdump: listening on eth0, link-type EN10MB (Ethernet), capture size 65535 bytes
1 packet captured
2 packets received by filter
0 packets dropped by kernel

By default the data is buffered and will not usually be written to the file until
you CTRL+C out of the running tcpdump command.

Reading from a file


# tcpdump -r /path/to/file

Once you save the output to a file you will inherently need to read that file. To do this
you can simply use the -r flag followed by the path to the file.
# tcpdump -r /var/tmp/tcpdata.pcap
reading from file /var/tmp/tcpdata.pcap, link-type EN10MB (Ethernet)
16:56:01.610473 IP blog.ssh > 10.0.3.1.32855: Flags [P.], seq 2547766673:2547766805, ack 1824696181,
win 355, options [nop,nop,TS val 621493577 ecr 621493478], length 132

As a quick note, if you are more familiar with tools such as wireshark you can read files
saved by tcpdump with most network troubleshooting tools like wireshark.

Specifying the capture size of each packet


# tcpdump -s 100

By default most newer implementations of tcpdump will capture 65535 bytes, however
in some situations you may not want to capture the default packet length. You can use -s to specify the "snaplen" or "snapshot length" that you want tcpdump to capture.

Specifying the number of packets to capture


# tcpdump -c 10

When you run tcpdump by itself it will keep running until you hit CTRL+C to quit.
# tcpdump host google.com
tcpdump: verbose output suppressed, use -v or -vv for full protocol decode
listening on eth0, link-type EN10MB (Ethernet), capture size 65535 bytes
^C
0 packets captured
4 packets received by filter
0 packets dropped by kernel

You can tell tcpdump to stop capturing after a certain number of packets by using the -c flag followed by the number of packets to capture. This is pretty useful for situations
where you may not want tcpdump to spew output to your screen so fast you can't read it,
however generally this is more useful when you are using filters to grab specific traffic.

Pulling the basics together


# tcpdump -nvvv -i any -c 100 -s 100

All of the basic flags that were covered above can also be combined to allow you to
specify exactly what you want tcpdump to provide.

# tcpdump -w /var/tmp/tcpdata.pcap -i any -c 10 -vvv


tcpdump: listening on any, link-type LINUX_SLL (Linux cooked), capture size 65535 bytes
10 packets captured
10 packets received by filter
0 packets dropped by kernel
# tcpdump -r /var/tmp/tcpdata.pcap -nvvv -c 5
reading from file /var/tmp/tcpdata.pcap, link-type LINUX_SLL (Linux cooked)
17:35:14.465902 IP (tos 0x10, ttl 64, id 5436, offset 0, flags [DF], proto TCP (6), length 104)
10.0.3.246.22 > 10.0.3.1.32855: Flags [P.], cksum 0x1b51 (incorrect -> 0x72bc), seq
2547781277:2547781329, ack 1824703573, win 355, options [nop,nop,TS val 622081791 ecr 622081775],
length 52
17:35:14.466007 IP (tos 0x10, ttl 64, id 52193, offset 0, flags [DF], proto TCP (6), length 52)
10.0.3.1.32855 > 10.0.3.246.22: Flags [.], cksum 0x1b1d (incorrect -> 0x4950), seq 1, ack 52, win 541,
options [nop,nop,TS val 622081791 ecr 622081791], length 0
17:35:14.470239 IP (tos 0x10, ttl 64, id 5437, offset 0, flags [DF], proto TCP (6), length 168)
10.0.3.246.22 > 10.0.3.1.32855: Flags [P.], cksum 0x1b91 (incorrect -> 0x98c3), seq 52:168, ack 1, win
355, options [nop,nop,TS val 622081792 ecr 622081791], length 116
17:35:14.470370 IP (tos 0x10, ttl 64, id 52194, offset 0, flags [DF], proto TCP (6), length 52)
10.0.3.1.32855 > 10.0.3.246.22: Flags [.], cksum 0x1b1d (incorrect -> 0x48da), seq 1, ack 168, win 541,
options [nop,nop,TS val 622081792 ecr 622081792], length 0
17:35:15.464575 IP (tos 0x10, ttl 64, id 5438, offset 0, flags [DF], proto TCP (6), length 104)
10.0.3.246.22 > 10.0.3.1.32855: Flags [P.], cksum 0x1b51 (incorrect -> 0xc3ba), seq 168:220, ack 1,
win 355, options [nop,nop,TS val 622082040 ecr 622081792], length 52

Filters
Now that we have covered some of the basic flags we should cover
filtering. tcpdump has the ability to filter the capture or output based on a variety of
expressions, in this article I am only going to cover a few quick examples to give you an
idea of the syntax. For a full list you can checkout the pcap-filter section of
the tcpdump manpage.

Searching for traffic to and from a specific host


# tcpdump -nvvv -i any -c 3 host 10.0.3.1

The above command will run a tcpdump and send the output to the screen like we saw
with the flags before, however it will only do so if the source or destination IP address

is 10.0.3.1. Essentially by adding host 10.0.3.1 we are asking tcpdump to filter out
anything that is not to or from 10.0.3.1.
# tcpdump -nvvv -i any -c 3 host 10.0.3.1
tcpdump: listening on any, link-type LINUX_SLL (Linux cooked), capture size 65535 bytes
17:54:15.067496 IP (tos 0x10, ttl 64, id 5502, offset 0, flags [DF], proto TCP (6), length 184)
10.0.3.246.22 > 10.0.3.1.32855: Flags [P.], cksum 0x1ba1 (incorrect -> 0x9f75), seq
2547785621:2547785753, ack 1824705637, win 355, options [nop,nop,TS val 622366941 ecr 622366923],
length 132
17:54:15.067613 IP (tos 0x10, ttl 64, id 52315, offset 0, flags [DF], proto TCP (6), length 52)
10.0.3.1.32855 > 10.0.3.246.22: Flags [.], cksum 0x1b1d (incorrect -> 0x7c34), seq 1, ack 132, win 540,
options [nop,nop,TS val 622366941 ecr 622366941], length 0
17:54:15.075230 IP (tos 0x10, ttl 64, id 5503, offset 0, flags [DF], proto TCP (6), length 648)
10.0.3.246.22 > 10.0.3.1.32855: Flags [P.], cksum 0x1d71 (incorrect -> 0x3443), seq 132:728, ack 1,
win 355, options [nop,nop,TS val 622366943 ecr 622366941], length 596

Only show traffic where the source is a specific host


# tcpdump -nvvv -i any -c 3 src host 10.0.3.1

Where the previous example showed traffic to and from 10.0.3.1 the above command
will only show traffic where the source of the packet is 10.0.3.1. This is accomplished
by adding src in front of the host filter. This is an additional filter that tells tcpdump to
look for a specific "source". This can be reversed by using the dst filter, which specifies
the "destination".
# tcpdump -nvvv -i any -c 3 src host 10.0.3.1
tcpdump: listening on any, link-type LINUX_SLL (Linux cooked), capture size 65535 bytes
17:57:12.194902 IP (tos 0x10, ttl 64, id 52357, offset 0, flags [DF], proto TCP (6), length 52)
10.0.3.1.32855 > 10.0.3.246.22: Flags [.], cksum 0x1b1d (incorrect -> 0x1707), seq 1824706545, ack
2547787717, win 540, options [nop,nop,TS val 622411223 ecr 622411223], length 0
17:57:12.196288 IP (tos 0x10, ttl 64, id 52358, offset 0, flags [DF], proto TCP (6), length 52)
10.0.3.1.32855 > 10.0.3.246.22: Flags [.], cksum 0x1b1d (incorrect -> 0x15c5), seq 0, ack 325, win 538,
options [nop,nop,TS val 622411223 ecr 622411223], length 0
17:57:12.197677 IP (tos 0x10, ttl 64, id 52359, offset 0, flags [DF], proto TCP (6), length 52)
10.0.3.1.32855 > 10.0.3.246.22: Flags [.], cksum 0x1b1d (incorrect -> 0x1491), seq 0, ack 633, win
536, options [nop,nop,TS val 622411224 ecr 622411224], length 0
# tcpdump -nvvv -i any -c 3 dst host 10.0.3.1
tcpdump: listening on any, link-type LINUX_SLL (Linux cooked), capture size 65535 bytes
17:59:37.266838 IP (tos 0x10, ttl 64, id 5552, offset 0, flags [DF], proto TCP (6), length 184)

10.0.3.246.22 > 10.0.3.1.32855: Flags [P.], cksum 0x1ba1 (incorrect -> 0x586d), seq
2547789725:2547789857, ack 1824707577, win 355, options [nop,nop,TS val 622447491 ecr 622447471],
length 132
17:59:37.267850 IP (tos 0x10, ttl 64, id 5553, offset 0, flags [DF], proto TCP (6), length 392)
10.0.3.246.22 > 10.0.3.1.32855: Flags [P.], cksum 0x1c71 (incorrect -> 0x462e), seq 132:472, ack 1,
win 355, options [nop,nop,TS val 622447491 ecr 622447491], length 340
17:59:37.268606 IP (tos 0x10, ttl 64, id 5554, offset 0, flags [DF], proto TCP (6), length 360)
10.0.3.246.22 > 10.0.3.1.32855: Flags [P.], cksum 0x1c51 (incorrect -> 0xf469), seq 472:780, ack 1, win
355, options [nop,nop,TS val 622447491 ecr 622447491], length 308

Filtering source and destination ports


# tcpdump -nvvv -i any -c 3 port 22 and port 60738

You can add some rather complicated filtering statements with tcpdump when you start
to use operators like and. You can think of this as something similar to if statements. In
this example we are using the and operator to tell tcpdump to only output packets that
have both ports 22 and 60738. This allows us to narrow down the packets to a specific
session, this can be extremely useful when troubleshooting network issues.
# tcpdump -nvvv -i any -c 3 port 22 and port 60738
tcpdump: listening on any, link-type LINUX_SLL (Linux cooked), capture size 65535 bytes
18:05:54.069403 IP (tos 0x10, ttl 64, id 64401, offset 0, flags [DF], proto TCP (6), length 104)
10.0.3.1.60738 > 10.0.3.246.22: Flags [P.], cksum 0x1b51 (incorrect -> 0x5b3c), seq
917414532:917414584, ack 1550997318, win 353, options [nop,nop,TS val 622541691 ecr 622538903],
length 52
18:05:54.072963 IP (tos 0x10, ttl 64, id 13601, offset 0, flags [DF], proto TCP (6), length 184)
10.0.3.246.22 > 10.0.3.1.60738: Flags [P.], cksum 0x1ba1 (incorrect -> 0xb0b1), seq 1:133, ack 52, win
355, options [nop,nop,TS val 622541692 ecr 622541691], length 132
18:05:54.073080 IP (tos 0x10, ttl 64, id 64402, offset 0, flags [DF], proto TCP (6), length 52)
10.0.3.1.60738 > 10.0.3.246.22: Flags [.], cksum 0x1b1d (incorrect -> 0x1e3b), seq 52, ack 133, win
353, options [nop,nop,TS val 622541692 ecr 622541692], length 0

You can express the and operator in a couple of different ways, you can use and or &&.
Personally, I tend to use them both; it is important to remember that if you are going to
use && that you should enclose the filter expression with single or double quotes. In
BASH you can use && to run one command and if successful run a second. In general it
is best to simply wrap filter expressions in quotes; this will prevent any unexpected
results as filters can have quite a few special characters.

# tcpdump -nvvv -i any -c 3 'port 22 && port 60738'


tcpdump: listening on any, link-type LINUX_SLL (Linux cooked), capture size 65535 bytes
18:06:16.062818 IP (tos 0x10, ttl 64, id 64405, offset 0, flags [DF], proto TCP (6), length 88)
10.0.3.1.60738 > 10.0.3.246.22: Flags [P.], cksum 0x1b41 (incorrect -> 0x776c), seq
917414636:917414672, ack 1550997518, win 353, options [nop,nop,TS val 622547190 ecr 622541776],
length 36
18:06:16.065567 IP (tos 0x10, ttl 64, id 13603, offset 0, flags [DF], proto TCP (6), length 120)
10.0.3.246.22 > 10.0.3.1.60738: Flags [P.], cksum 0x1b61 (incorrect -> 0xaf2d), seq 1:69, ack 36, win
355, options [nop,nop,TS val 622547191 ecr 622547190], length 68
18:06:16.065696 IP (tos 0x10, ttl 64, id 64406, offset 0, flags [DF], proto TCP (6), length 52)
10.0.3.1.60738 > 10.0.3.246.22: Flags [.], cksum 0x1b1d (incorrect -> 0xf264), seq 36, ack 69, win 353,
options [nop,nop,TS val 622547191 ecr 622547191], length 0

Searching for traffic on one port or another


# tcpdump -nvvv -i any -c 20 'port 80 or port 443'

You can also use the or or || operator to filter tcpdump results. In this example we are
using the or operator to capture traffic to and from port 80 or port 443. This example is
especially useful as webservers generally have two ports open, 80 for http traffic
and 443 for https.
# tcpdump -nvvv -i any -c 20 'port 80 or port 443'
tcpdump: listening on any, link-type LINUX_SLL (Linux cooked), capture size 65535 bytes
18:24:28.817940 IP (tos 0x0, ttl 64, id 39930, offset 0, flags [DF], proto TCP (6), length 60)
10.0.3.1.50524 > 10.0.3.246.443: Flags [S], cksum 0x1b25 (incorrect -> 0x8611), seq 3836995553, win
29200, options [mss 1460,sackOK,TS val 622820379 ecr 0,nop,wscale 7], length 0
18:24:28.818052 IP (tos 0x0, ttl 64, id 0, offset 0, flags [DF], proto TCP (6), length 40)
10.0.3.246.443 > 10.0.3.1.50524: Flags [R.], cksum 0x012c (correct), seq 0, ack 3836995554, win 0,
length 0
18:24:32.721330 IP (tos 0x0, ttl 64, id 48510, offset 0, flags [DF], proto TCP (6), length 475)
10.0.3.1.60374 > 10.0.3.246.80: Flags [P.], cksum 0x1cc4 (incorrect -> 0x3a4e), seq
580573019:580573442, ack 1982754038, win 237, options [nop,nop,TS val 622821354 ecr 622815632],
length 423
18:24:32.721465 IP (tos 0x0, ttl 64, id 1266, offset 0, flags [DF], proto TCP (6), length 52)
10.0.3.246.80 > 10.0.3.1.60374: Flags [.], cksum 0x1b1d (incorrect -> 0x45d7), seq 1, ack 423, win
243, options [nop,nop,TS val 622821355 ecr 622821354], length 0
18:24:32.722098 IP (tos 0x0, ttl 64, id 1267, offset 0, flags [DF], proto TCP (6), length 241)
10.0.3.246.80 > 10.0.3.1.60374: Flags [P.], cksum 0x1bda (incorrect -> 0x855c), seq 1:190, ack 423,
win 243, options [nop,nop,TS val 622821355 ecr 622821354], length 189
18:24:32.722232 IP (tos 0x0, ttl 64, id 48511, offset 0, flags [DF], proto TCP (6), length 52)

10.0.3.1.60374 > 10.0.3.246.80: Flags [.], cksum 0x1b1d (incorrect -> 0x4517), seq 423, ack 190, win
245, options [nop,nop,TS val 622821355 ecr 622821355], length 0

Searching for traffic on two specific ports and from a specific host
# tcpdump -nvvv -i any -c 20 '(port 80 or port 443) and host 10.0.3.169'

While the previous example is great for looking at issues for a multiport protocol; what if
this is a very high traffic webserver? The output from tcpdump may get a bit confusing.
We can narrow down the results even further by adding a host filter. To do this while
maintaining our or expression we can simply wrap the or statement in parentheses.
# tcpdump -nvvv -i any -c 20 '(port 80 or port 443) and host 10.0.3.169'
tcpdump: listening on any, link-type LINUX_SLL (Linux cooked), capture size 65535 bytes
18:38:05.551194 IP (tos 0x0, ttl 64, id 63169, offset 0, flags [DF], proto TCP (6), length 60)
10.0.3.169.33786 > 10.0.3.246.443: Flags [S], cksum 0x1bcd (incorrect -> 0x0d96), seq 4173164403,
win 29200, options [mss 1460,sackOK,TS val 623024562 ecr 0,nop,wscale 7], length 0
18:38:05.551310 IP (tos 0x0, ttl 64, id 0, offset 0, flags [DF], proto TCP (6), length 40)
10.0.3.246.443 > 10.0.3.169.33786: Flags [R.], cksum 0xa64a (correct), seq 0, ack 4173164404, win 0,
length 0
18:38:05.717130 IP (tos 0x0, ttl 64, id 51574, offset 0, flags [DF], proto TCP (6), length 60)
10.0.3.169.35629 > 10.0.3.246.80: Flags [S], cksum 0x1bcd (incorrect -> 0xdf7c), seq 1068257453, win
29200, options [mss 1460,sackOK,TS val 623024603 ecr 0,nop,wscale 7], length 0
18:38:05.717255 IP (tos 0x0, ttl 64, id 0, offset 0, flags [DF], proto TCP (6), length 60)
10.0.3.246.80 > 10.0.3.169.35629: Flags [S.], cksum 0x1bcd (incorrect -> 0xed80), seq 2992472447,
ack 1068257454, win 28960, options [mss 1460,sackOK,TS val 623024603 ecr 623024603,nop,wscale 7],
length 0
18:38:05.717474 IP (tos 0x0, ttl 64, id 51575, offset 0, flags [DF], proto TCP (6), length 52)
10.0.3.169.35629 > 10.0.3.246.80: Flags [.], cksum 0x1bc5 (incorrect -> 0x8c87), seq 1, ack 1, win 229,
options [nop,nop,TS val 623024604 ecr 623024603], length 0

You can use the parenthesis multiple times in a single filter, for example the below
command will filter the capture to only packets that are to or from port 80 or
port 443 and from hosts 10.0.3.169 and 10.0.3.1 if they are destined for 10.0.3.246.
# tcpdump -nvvv -i any -c 20 '((port 80 or port 443) and (host 10.0.3.169 or host 10.0.3.1)) and
dst host 10.0.3.246'
tcpdump: listening on any, link-type LINUX_SLL (Linux cooked), capture size 65535 bytes
18:53:30.349306 IP (tos 0x0, ttl 64, id 52641, offset 0, flags [DF], proto TCP (6), length 60)

10.0.3.1.35407 > 10.0.3.246.80: Flags [S], cksum 0x1b25 (incorrect -> 0x4890), seq 3026316656, win
29200, options [mss 1460,sackOK,TS val 623255761 ecr 0,nop,wscale 7], length 0
18:53:30.349558 IP (tos 0x0, ttl 64, id 52642, offset 0, flags [DF], proto TCP (6), length 52)
10.0.3.1.35407 > 10.0.3.246.80: Flags [.], cksum 0x1b1d (incorrect -> 0x3454), seq 3026316657, ack
3657995297, win 229, options [nop,nop,TS val 623255762 ecr 623255762], length 0
18:53:30.354056 IP (tos 0x0, ttl 64, id 52643, offset 0, flags [DF], proto TCP (6), length 475)
10.0.3.1.35407 > 10.0.3.246.80: Flags [P.], cksum 0x1cc4 (incorrect -> 0x10c2), seq 0:423, ack 1, win
229, options [nop,nop,TS val 623255763 ecr 623255762], length 423
18:53:30.354682 IP (tos 0x0, ttl 64, id 52644, offset 0, flags [DF], proto TCP (6), length 52)
10.0.3.1.35407 > 10.0.3.246.80: Flags [.], cksum 0x1b1d (incorrect -> 0x31e6), seq 423, ack 190, win
237, options [nop,nop,TS val 623255763 ecr 623255763], length 0

Understanding the output


Capturing network traffic with tcpdump is hard enough with all of the options, but once
you have that data you have to decipher it. In this section we are going to cover how to
identify the source/destination IP, source/destination Port and the type of packet for the
TCP protocol. While these are all very basic items they are far from the extent of what
you can identify from tcpdump, however this article is meant to be quick and dirty so
we will keep it to the basics. For more information on tcpdump and what is being listed
I suggest checking out the manpages.

Identifying the source and destination


Identifying the source and destination addresses and ports are actually fairly easy.
10.0.3.246.56894 > 192.168.0.92.22: Flags [S], cksum 0xcf28 (incorrect -> 0x0388), seq 682725222, win
29200, options [mss 1460,sackOK,TS val 619989005 ecr 0,nop,wscale 7], length 0

Given the above output we can see that the source ip is 10.0.3.246 the source port
is 56894 and the destination ip is 192.168.0.92 with a destination port of 22. This is
pretty easy to identify once you understand the format of tcpdump. If you haven't
guessed the format yet you can break it down as follows: src-ip.src-port > dest-ip.dest-port: Flags [S].
The source is in front of the > and the destination is behind. You can think
of the > as an arrow pointing to the destination.

Identifying the type of packet


10.0.3.246.56894 > 192.168.0.92.22: Flags [S], cksum 0xcf28 (incorrect -> 0x0388), seq 682725222, win
29200, options [mss 1460,sackOK,TS val 619989005 ecr 0,nop,wscale 7], length 0

From the sample above we can tell that the packet is a single SYN packet. We can identify
this by the Flags [S] section of the tcpdump output; different types of packets have
different types of flags. Without going too deep into what types of packets exist within
TCP you can use the below as a cheat sheet for identifying packet types.
[S] - SYN (Start Connection)
[.] - No Flag Set
[P] - PSH (Push Data)
[F] - FIN (Finish Connection)
[R] - RST (Reset Connection)
Depending on the version and output of tcpdump you may also see flags such as [S.]; this
is used to indicate a SYN-ACK packet.
An unhealthy example
15:15:43.323412 IP (tos 0x0, ttl 64, id 51051, offset 0, flags [DF], proto TCP (6), length 60)
10.0.3.246.56894 > 192.168.0.92.22: Flags [S], cksum 0xcf28 (incorrect -> 0x0388), seq 682725222,
win 29200, options [mss 1460,sackOK,TS val 619989005 ecr 0,nop,wscale 7], length 0
15:15:44.321444 IP (tos 0x0, ttl 64, id 51052, offset 0, flags [DF], proto TCP (6), length 60)
10.0.3.246.56894 > 192.168.0.92.22: Flags [S], cksum 0xcf28 (incorrect -> 0x028e), seq 682725222,
win 29200, options [mss 1460,sackOK,TS val 619989255 ecr 0,nop,wscale 7], length 0
15:15:46.321610 IP (tos 0x0, ttl 64, id 51053, offset 0, flags [DF], proto TCP (6), length 60)
10.0.3.246.56894 > 192.168.0.92.22: Flags [S], cksum 0xcf28 (incorrect -> 0x009a), seq 682725222,
win 29200, options [mss 1460,sackOK,TS val 619989755 ecr 0,nop,wscale 7], length 0

The above sampling shows an example of an unhealthy exchange, and by unhealthy
exchange, in this example, we mean no exchange at all. In the above sample we can see
that 10.0.3.246 is sending a SYN packet to host 192.168.0.92; however, we never see a
response from host 192.168.0.92.

A healthy example
15:18:25.716453 IP (tos 0x10, ttl 64, id 53344, offset 0, flags [DF], proto TCP (6), length 60)
10.0.3.246.34908 > 192.168.0.110.22: Flags [S], cksum 0xcf3a (incorrect -> 0xc838), seq 1943877315,
win 29200, options [mss 1460,sackOK,TS val 620029603 ecr 0,nop,wscale 7], length 0
15:18:25.716777 IP (tos 0x0, ttl 63, id 0, offset 0, flags [DF], proto TCP (6), length 60)
192.168.0.110.22 > 10.0.3.246.34908: Flags [S.], cksum 0x594a (correct), seq 4001145915, ack
1943877316, win 5792, options [mss 1460,sackOK,TS val 18495104 ecr 620029603,nop,wscale 2], length
0
15:18:25.716899 IP (tos 0x10, ttl 64, id 53345, offset 0, flags [DF], proto TCP (6), length 52)
10.0.3.246.34908 > 192.168.0.110.22: Flags [.], cksum 0xcf32 (incorrect -> 0x9dcc), ack 1, win 229,
options [nop,nop,TS val 620029603 ecr 18495104], length 0

A healthy example would look like the above; here we can see a standard TCP 3-way
handshake. The first packet above is a SYN packet from host 10.0.3.246 to
host 192.168.0.110, the second packet is a SYN-ACK from
host 192.168.0.110 acknowledging the SYN. The final packet is an ACK, or rather a
SYN-ACK-ACK, from host 10.0.3.246 acknowledging that it has received the SYN-ACK. From
this point on there is an established TCP/IP connection.

Packet Inspection
Printing packet data in Hex and ASCII
# tcpdump -nvvv -i any -c 1 -XX 'port 80 and host 10.0.3.1'

A common method of troubleshooting application issues over the network is
using tcpdump with the -XX flag to print the packet data in hex and ASCII. This is a
pretty helpful command; it allows you to look at the source, destination, type of
packet and the packet itself. However, I am not a fan of this output. I think it is a bit hard
to read.
# tcpdump -nvvv -i any -c 1 -XX 'port 80 and host 10.0.3.1'
tcpdump: listening on any, link-type LINUX_SLL (Linux cooked), capture size 65535 bytes
19:51:15.697640 IP (tos 0x0, ttl 64, id 54313, offset 0, flags [DF], proto TCP (6), length 483)
10.0.3.1.45732 > 10.0.3.246.80: Flags [P.], cksum 0x1ccc (incorrect -> 0x2ce8), seq
3920159713:3920160144, ack 969855140, win 245, options [nop,nop,TS val 624122099 ecr 624117334],
length 431
0x0000: 0000 0001 0006 fe0a e2d1 8785 0000 0800 ................

0x0010: 4500 01e3 d429 4000 4006 49f5 0a00 0301 E....)@.@.I.....
0x0020: 0a00 03f6 b2a4 0050 e9a8 e3e1 39ce d0a4 .......P....9...
0x0030: 8018 00f5 1ccc 0000 0101 080a 2533 58f3 ............%3X.
0x0040: 2533 4656 4745 5420 2f73 6f6d 6570 6167 %3FVGET./somepag
0x0050: 6520 4854 5450 2f31 2e31 0d0a 486f 7374 e.HTTP/1.1..Host
0x0060: 3a20 3130 2e30 2e33 2e32 3436 0d0a 436f :.10.0.3.246..Co
0x0070: 6e6e 6563 7469 6f6e 3a20 6b65 6570 2d61 nnection:.keep-a
0x0080: 6c69 7665 0d0a 4361 6368 652d 436f 6e74 live..Cache-Cont
0x0090: 726f 6c3a 206d 6178 2d61 6765 3d30 0d0a rol:.max-age=0..
0x00a0: 4163 6365 7074 3a20 7465 7874 2f68 746d Accept:.text/htm
0x00b0: 6c2c 6170 706c 6963 6174 696f 6e2f 7868 l,application/xh
0x00c0: 746d 6c2b 786d 6c2c 6170 706c 6963 6174 tml+xml,applicat
0x00d0: 696f 6e2f 786d 6c3b 713d 302e 392c 696d ion/xml;q=0.9,im
0x00e0: 6167 652f 7765 6270 2c2a 2f2a 3b71 3d30 age/webp,*/*;q=0
0x00f0: 2e38 0d0a 5573 6572 2d41 6765 6e74 3a20 .8..User-Agent:.
0x0100: 4d6f 7a69 6c6c 612f 352e 3020 284d 6163 Mozilla/5.0.(Mac
0x0110: 696e 746f 7368 3b20 496e 7465 6c20 4d61 intosh;.Intel.Ma
0x0120: 6320 4f53 2058 2031 305f 395f 3529 2041 c.OS.X.10_9_5).A
0x0130: 7070 6c65 5765 624b 6974 2f35 3337 2e33 ppleWebKit/537.3
0x0140: 3620 284b 4854 4d4c 2c20 6c69 6b65 2047 6.(KHTML,.like.G
0x0150: 6563 6b6f 2920 4368 726f 6d65 2f33 382e ecko).Chrome/38.
0x0160: 302e 3231 3235 2e31 3031 2053 6166 6172 0.2125.101.Safar
0x0170: 692f 3533 372e 3336 0d0a 4163 6365 7074 i/537.36..Accept
0x0180: 2d45 6e63 6f64 696e 673a 2067 7a69 702c -Encoding:.gzip,
0x0190: 6465 666c 6174 652c 7364 6368 0d0a 4163 deflate,sdch..Ac
0x01a0: 6365 7074 2d4c 616e 6775 6167 653a 2065 cept-Language:.e
0x01b0: 6e2d 5553 2c65 6e3b 713d 302e 380d 0a49 n-US,en;q=0.8..I
0x01c0: 662d 4d6f 6469 6669 6564 2d53 696e 6365 f-Modified-Since
0x01d0: 3a20 5375 6e2c 2031 3220 4f63 7420 3230 :.Sun,.12.Oct.20
0x01e0: 3134 2031 393a 3430 3a32 3020 474d 540d 14.19:40:20.GMT.
0x01f0: 0a0d 0a

...

Printing packet data in ASCII only


# tcpdump -nvvv -i any -c 1 -A 'port 80 and host 10.0.3.1'

I tend to prefer to print only the ASCII data; this helps me to quickly identify what is
being sent and what is correct or not correct about the packet's data. To print packet data
in only the ASCII format you can use the -A flag.
# tcpdump -nvvv -i any -c 1 -A 'port 80 and host 10.0.3.1'
tcpdump: listening on any, link-type LINUX_SLL (Linux cooked), capture size 65535 bytes
19:59:52.011337 IP (tos 0x0, ttl 64, id 53757, offset 0, flags [DF], proto TCP (6), length 406)
10.0.3.1.46172 > 10.0.3.246.80: Flags [P.], cksum 0x1c7f (incorrect -> 0xead1), seq
1552520173:1552520527, ack 428165415, win 237, options [nop,nop,TS val 624251177 ecr 624247749],
length 354
E.....@.@.Ln
...
....\.P\.....I'...........
%5Q)%5C.GET /newpage HTTP/1.1
Host: 10.0.3.246
Connection: keep-alive
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8
User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko)
Chrome/38.0.2125.101 Safari/537.36
Accept-Encoding: gzip,deflate,sdch
Accept-Language: en-US,en;q=0.8

As you can see from the output above we have successfully captured an http GET request.
Being able to print the packet data in a human readable format is very useful when
troubleshooting application issues where the traffic is not encrypted. If you are
troubleshooting encrypted traffic then printing packet data is not very useful. However, if
you have the certificates in use you could use commands such as ssldump or
even wireshark.

Non-TCP Traffic
While the majority of this article covered TCP based traffic, tcpdump can capture much
more than TCP. It can also be used to capture ICMP, UDP, and ARP packets to name a
few. Below are a few quick examples of non-TCP packets captured by tcpdump.

ICMP packets
# tcpdump -nvvv -i any -c 2 icmp

tcpdump: listening on any, link-type LINUX_SLL (Linux cooked), capture size 65535 bytes
20:11:24.627824 IP (tos 0x0, ttl 64, id 0, offset 0, flags [DF], proto ICMP (1), length 84)
10.0.3.169 > 10.0.3.246: ICMP echo request, id 15683, seq 1, length 64
20:11:24.627926 IP (tos 0x0, ttl 64, id 31312, offset 0, flags [none], proto ICMP (1), length 84)
10.0.3.246 > 10.0.3.169: ICMP echo reply, id 15683, seq 1, length 64

UDP packets
# tcpdump -nvvv -i any -c 2 udp
tcpdump: listening on any, link-type LINUX_SLL (Linux cooked), capture size 65535 bytes
20:12:41.726355 IP (tos 0xc0, ttl 64, id 0, offset 0, flags [DF], proto UDP (17), length 76)
10.0.3.246.123 > 198.55.111.50.123: [bad udp cksum 0x43a9 -> 0x7043!] NTPv4, length 48
Client, Leap indicator: clock unsynchronized (192), Stratum 2 (secondary reference), poll 6 (64s),
precision -22
Root Delay: 0.085678, Root dispersion: 57.141830, Reference-ID: 199.102.46.75
Reference Timestamp: 3622133515.811991035 (2014/10/12 20:11:55)
Originator Timestamp: 3622133553.828614115 (2014/10/12 20:12:33)
Receive Timestamp: 3622133496.748308420 (2014/10/12 20:11:36)

Transmit Timestamp: 3622133561.726278364 (2014/10/12 20:12:41)


Originator - Receive Timestamp: -57.080305658
Originator - Transmit Timestamp: +7.897664248
20:12:41.748948 IP (tos 0x0, ttl 54, id 9285, offset 0, flags [none], proto UDP (17), length 76)
198.55.111.50.123 > 10.0.3.246.123: [udp sum ok] NTPv4, length 48
Server, Leap indicator: (0), Stratum 3 (secondary reference), poll 6 (64s), precision -20
Root Delay: 0.054077, Root dispersion: 0.058944, Reference-ID: 216.229.0.50
Reference Timestamp: 3622132887.136984840 (2014/10/12 20:01:27)
Originator Timestamp: 3622133561.726278364 (2014/10/12 20:12:41)
Receive Timestamp: 3622133618.830113530 (2014/10/12 20:13:38)

Transmit Timestamp: 3622133618.830129086 (2014/10/12 20:13:38)


Originator - Receive Timestamp: +57.103835195
Originator - Transmit Timestamp: +57.103850722

Socket options SO_REUSEADDR and SO_REUSEPORT


Welcome to the wonderful world of portability... or rather the lack of it. Before we start analyzing these two
options in detail and take a deeper look how different operating systems handle them, it should be noted
that the BSD socket implementation is the mother of all socket implementations. Basically all other
systems copied the BSD socket implementation at some point of time (or at least its interfaces) and then

started evolving it on their own. Of course the BSD socket implementation was evolved as well at the
same time and thus systems that copied it later got features that were lacking in systems that copied it
earlier. Understanding the BSD socket implementation is the key to understanding all other socket
implementations, so you should read about it even if you don't care to ever write code for a BSD system.
There are a couple of basics you should know before we look at these two options. A TCP/UDP
connection is identified by a tuple of five values:
{<protocol>, <src addr>, <src port>, <dest addr>, <dest port>}

Any unique combination of these values identifies a connection. As a result, no two connections can have
the same five values, otherwise the system would not be able to distinguish these connections any longer.
The protocol of a socket is set when a socket is created with the socket() function. The source address
and port are set with the bind() function. The destination address and port are set with
the connect() function. Since UDP is a connectionless protocol, UDP sockets can be used without
connecting them. Yet it is allowed to connect them and in some cases very advantageous for your code
and general application design. In connectionless mode, UDP sockets that were not explicitly bound when
data is sent over them for the first time are usually automatically bound by the system, as an unbound
UDP socket cannot receive any (reply) data. Same is true for an unbound TCP socket, it is automatically
bound before it will be connected.
If you explicitly bind a socket, it is possible to bind it to port 0, which means "any port". Since a socket
cannot really be bound to all existing ports, the system will have to choose a specific port itself in that case
(usually from a predefined, OS specific range of source ports). A similar wildcard exists for the source
address, which can be "any address" (0.0.0.0 in case of IPv4 and :: in case of IPv6). Unlike in case of
ports, a socket can really be bound to "any address" which means "all source IP addresses of all local
interfaces". If the socket is connected later on, the system has to choose a specific source IP address,
since a socket cannot be connected and at the same time be bound to any local IP address. Depending
on the destination address and the content of the routing table, the system will pick an appropriate source
address and replace the "any" binding with a binding to the chosen source IP address.
By default, no two sockets can be bound to the same combination of source address and source port. As
long as the source port is different, the source address is actually irrelevant.
Binding socketA to A:X and socketB to B:Y, where A and B are addresses and X and Y are ports, is always
possible as long as X != Y holds true. However, even if X == Y, the binding is still possible as long as A !=
B holds true. E.g. socketA belongs to a FTP server program and is bound
to 192.168.0.1:21 and socketB belongs to another FTP server program and is bound to 10.0.0.1:21, both
bindings will succeed. Keep in mind, though, that a socket may be locally bound to "any address". If a
socket is bound to 0.0.0.0:21, it is bound to all existing local addresses at the same time and in that case
no other socket can be bound to port 21, regardless which specific IP address it tries to bind to,
as 0.0.0.0 conflicts with all existing local IP addresses.

Anything said so far is pretty much equal for all major operating systems. Things start to get OS specific
when address reuse comes into play. We start with BSD, since as I said above, it is the mother of all
socket implementations.

BSD SO_REUSEADDR (BSD is not Linux in this article's context)


If SO_REUSEADDR is enabled on a socket prior to binding it, the socket can be successfully bound unless
there is a conflict with another socket bound to exactly the same combination of source address and port.
Now you may wonder how is that any different than before? The keyword is
"exactly". SO_REUSEADDR mainly changes the way how wildcard addresses ("any IP address") are treated
when searching for conflicts.
Without SO_REUSEADDR, binding socketA to 0.0.0.0:21 and then binding socketB to 192.168.0.1:21 will fail
(with error EADDRINUSE), since 0.0.0.0 means "any local IP address", thus all local IP addresses are
considered in use by this socket and this includes 192.168.0.1, too. With SO_REUSEADDR it will succeed,
since 0.0.0.0 and 192.168.0.1 are not exactly the same address, one is a wildcard for all local addresses
and the other one is a very specific local address. Note that the statement above is true regardless in
which order socketA and socketB are bound; without SO_REUSEADDR it will always fail,
with SO_REUSEADDR it will always succeed.
To give you a better overview, let's make a table here and list all possible combinations:
SO_REUSEADDR    socketA           socketB           Result
---------------------------------------------------------------------
ON/OFF          192.168.0.1:21    192.168.0.1:21    Error (EADDRINUSE)
ON/OFF          192.168.0.1:21    10.0.0.1:21       OK
ON/OFF          10.0.0.1:21       192.168.0.1:21    OK
OFF             0.0.0.0:21        192.168.1.0:21    Error (EADDRINUSE)
OFF             192.168.1.0:21    0.0.0.0:21        Error (EADDRINUSE)
ON              0.0.0.0:21        192.168.1.0:21    OK
ON              192.168.1.0:21    0.0.0.0:21        OK
ON/OFF          0.0.0.0:21        0.0.0.0:21        Error (EADDRINUSE)


The table above assumes that socketA has already been successfully bound to the address given
for socketA, then socketB is created, either gets SO_REUSEADDR set or not, and finally is bound to the
address given for socketB. Result is the result of the bind operation for socketB. If the first column
says ON/OFF, the value of SO_REUSEADDR is irrelevant to the result.
Okay, SO_REUSEADDR has an effect on wildcard addresses, good to know. Yet that isn't the only effect it
has. There is another well known effect which is also the reason why most people use SO_REUSEADDR in

server programs in the first place. For the other important use of this option we have to take a deeper look
on how the TCP protocol works.
A socket has a send buffer and if a call to the send() function succeeds, it does not mean that the
requested data has actually really been sent out, it only means the data has been added to the send
buffer. For UDP sockets, the data is usually sent pretty soon, if not immediately, but for TCP sockets, there
can be a relatively long delay between adding data to the send buffer and having the TCP implementation
really send that data. As a result, when you close a TCP socket, there may still be pending data in the
send buffer, which has not been sent yet but your code considers it as sent, since the send() call
succeeded. If the TCP implementation was closing the socket immediately on your request, all of this data
would be lost and your code wouldn't even know about that. TCP is said to be a reliable protocol and
losing data just like that is not very reliable. That's why a socket that still has data to send will go into a
state called TIME_WAIT when you close it. In that state it will wait until all pending data has been
successfully sent or until a timeout is hit, in which case the socket is closed forcefully.
The amount of time the kernel will wait before it closes the socket, regardless if it still has pending send
data or not, is called the Linger Time. The Linger Time is globally configurable on most systems and by
default rather long (two minutes is a common value you will find on many systems). It is also configurable
per socket using the socket option SO_LINGER which can be used to make the timeout shorter or longer,
and even to disable it completely. Disabling it completely is a very bad idea, though, since closing a TCP
socket gracefully is a slightly complex process and involves sending a couple of packets back and forth
(as well as resending those packets in case they got lost) and this whole close process is also limited by
the Linger Time. If you disable lingering, your socket may not only lose pending data, it is also always
closed forcefully instead of gracefully, which is usually not recommended. The details about how a TCP
connection is closed gracefully are beyond the scope of this answer; if you want to learn more about it, I
recommend you have a look at this page. And even if you disabled lingering with SO_LINGER, if your
process dies without explicitly closing the socket, BSD (and possibly other systems) will linger
nonetheless, ignoring what you have configured. This will happen for example if your code just
calls exit() (pretty common for tiny, simple server programs) or the process is killed by a signal (which
includes the possibility that it simply crashes because of an illegal memory access). So there is nothing
you can do to make sure a socket will never linger under all circumstances.
The question is, how does the system treat a socket in state TIME_WAIT? If SO_REUSEADDR is not set, a
socket in state TIME_WAIT is considered to still be bound to the source address and port and any attempt
to bind a new socket to the same address and port will fail until the socket has really been closed, which
may take as long as the configured Linger Time. So don't expect that you can rebind the source address
of a socket immediately after closing it. In most cases this will fail. However, if SO_REUSEADDR is set for
the socket you are trying to bind, another socket bound to the same address and port in
state TIME_WAIT is simply ignored, after all it's already "half dead", and your socket can bind to exactly the
same address without any problem. In that case it plays no role that the other socket may have exactly the
same address and port. Note that binding a socket to exactly the same address and port as a dying socket
in TIME_WAIT state can have unexpected, and usually undesired, side effects in case the other socket is
still "at work", but that is beyond the scope of this answer and fortunately those side effects are rather rare
in practice.

There is one final thing you should know about SO_REUSEADDR. Everything written above will work as long
as the socket you want to bind to has address reuse enabled. It is not necessary that the other socket, the
one which is already bound or is in a TIME_WAIT state, also had this flag set when it was bound. The code
that decides if the bind will succeed or fail only inspects the SO_REUSEADDR flag of the socket fed into
the bind() call, for all other sockets inspected, this flag is not even looked at.

SO_REUSEPORT
SO_REUSEPORT is what most people would expect SO_REUSEADDR to be. Basically, SO_REUSEPORT allows
you to bind an arbitrary number of sockets to exactly the same source address and port as long
as all prior bound sockets also had SO_REUSEPORT set before they were bound. If the first socket that is
bound to an address and port does not have SO_REUSEPORT set, no other socket can be bound to exactly
the same address and port, regardless if this other socket has SO_REUSEPORT set or not, until the first
socket releases its binding again. Unlike in case of SO_REUSEADDR the code handling SO_REUSEPORT will
not only verify that the currently bound socket has SO_REUSEPORT set but it will also verify that the socket
with a conflicting address and port had SO_REUSEPORT set when it was bound.
SO_REUSEPORT does not imply SO_REUSEADDR. This means if a socket did not have SO_REUSEPORT set
when it was bound and another socket has SO_REUSEPORT set when it is bound to exactly the same
address and port, the bind fails, which is expected, but it also fails if the other socket is already dying and
is in TIME_WAIT state. To be able to bind a socket to the same address and port as another socket
in TIME_WAIT state requires either SO_REUSEADDR to be set on that socket or SO_REUSEPORT must have
been set on both sockets prior to binding them. Of course it is allowed to set
both, SO_REUSEPORT and SO_REUSEADDR, on a socket.
There is not much more to say about SO_REUSEPORT other than that it was added later
than SO_REUSEADDR, that's why you will not find it in many socket implementations of other systems,
which "forked" the BSD code before this option was added, and that there was no way to bind two sockets
to exactly the same socket address in BSD prior to this option.

Connect() Returning EADDRINUSE?


Most people know that bind() may fail with the error EADDRINUSE, however, when you start playing around
with address reuse, you may run into the strange situation that connect() fails with that error as well. How
can this be? How can a remote address, after all that's what connect adds to a socket, be already in use?
Connecting multiple sockets to exactly the same remote address has never been a problem before, so
what's going wrong here?
As I said on the very top of my reply, a connection is defined by a tuple of five values, remember? And I
also said, that these five values must be unique otherwise the system cannot distinguish two connections
any longer, right? Well, with address reuse, you can bind two sockets of the same protocol to the same
source address and port. That means three of those five values are already the same for these two
sockets. If you now try to connect both of these sockets also to the same destination address and port,

you would create two connected sockets, whose tuples are absolutely identical. This cannot work, at least
not for TCP connections (UDP connections are no real connections anyway). If data arrived for either one
of the two connections, the system could not tell which connection the data belongs to. At least the
destination address or destination port must be different for either connection, so that the system has no
problem to identify to which connection incoming data belongs to.
So if you bind two sockets of the same protocol to the same source address and port and try to connect
them both to the same destination address and port, connect() will actually fail with the
error EADDRINUSE for the second socket you try to connect, which means that a socket with an identical
tuple of five values is already connected.

Multicast Addresses
Most people ignore the fact that multicast addresses exist, but they do exist. While unicast addresses are
used for one-to-one communication, multicast addresses are used for one-to-many communication. Most
people became aware of multicast addresses when they learned about IPv6 but multicast addresses also
existed in IPv4, even though this feature was never widely used on the public Internet.
The meaning of SO_REUSEADDR changes for multicast addresses as it allows multiple sockets to be
bound to exactly the same combination of source multicast address and port. In other words, for multicast
addresses SO_REUSEADDR behaves exactly as SO_REUSEPORT for unicast addresses. Actually the code
treats SO_REUSEADDR and SO_REUSEPORT identically for multicast addresses, that means you could say
that SO_REUSEADDR implies SO_REUSEPORT for all multicast addresses and the other way round.

FreeBSD/OpenBSD/NetBSD
All these are rather late forks of the original BSD code, which is why all three offer the same options as
BSD and they also behave the same way as in BSD.

MacOS X
At its very core, MacOS X is simply a BSD-style UNIX, based on a rather late fork of the BSD code, which
was even synchronized with FreeBSD 5 for the Mac OS 10.3 release. That's why MacOS X offers the
same options as BSD and they also behave the same way as in BSD.

iOS
iOS is just modified MacOS X at its core, so everything that applies to MacOS X also applies to iOS.

Linux
Prior to Linux 3.9, only the option SO_REUSEADDR existed. This option behaves generally the same as in BSD
with two important exceptions. One exception is that if a listening (server) TCP socket is already bound
to a wildcard IP address and a specific port, no other TCP socket can be bound to the same port,
regardless of whether either one or both sockets have this flag set. Not even if it would use a more specific
address (as is allowed in case of BSD). This restriction does not apply to non-listening (client) TCP
sockets and it is also possible to first bind a listening TCP socket to a specific IP address and port
combination and later on bind another one to a wildcard IP address and the same port. The second
exception is that for UDP sockets this option behaves exactly like SO_REUSEPORT in BSD, so two UDP
sockets can be bound to exactly the same address and port combination as long as both had this flag set
before they were bound.
Linux 3.9 added the option SO_REUSEPORT to Linux as well. This option allows two (or more) sockets, TCP
or UDP, listening (server) or non-listening (client), to be bound to exactly the same address and port
combination as long as all sockets (including the very first one) had this flag set prior to binding them. To
prevent "port hijacking", there is one special limitation, though: All sockets that want to share the same
address and port combination must belong to processes that share the same effective user ID! So one
user cannot "steal" ports of another user. Additionally the kernel performs some "special magic"
for SO_REUSEPORT sockets that isn't found in any other operating system so far: For UDP sockets, it tries
to distribute datagrams evenly, for TCP listening sockets, it tries to distribute incoming connect requests
(those accepted by calling accept()) evenly across all the sockets that share the same address and port
combination. That means while it is more or less random which socket receives a datagram or connect
request in other operating systems that allow full address reuse, Linux tries to optimize distribution so that,
for example, multiple instances of a simple server process can easily use SO_REUSEPORT sockets to
achieve a kind of simple load balancing and that absolutely for free as the kernel is doing "all the hard
work" for them.

Android
Even though the whole Android system is somewhat different from most Linux distributions, at its core
works a slightly modified Linux kernel, thus everything that applies to Linux applies to Android as well.

Windows
Windows only knows the SO_REUSEADDR option, there is no SO_REUSEPORT. Setting SO_REUSEADDR on a
socket in Windows behaves like setting SO_REUSEPORT and SO_REUSEADDR on a socket in BSD, with one

exception: A socket with SO_REUSEADDR can always bind to exactly the same source address and port as
an already bound socket, even if the other socket did not have this option set when it was bound.
This behavior is somewhat dangerous because it allows an application "to steal" the connected port of
another application. Needless to say, this can have major security implications. Microsoft realized that this
might be a problem and thus added another socket option SO_EXCLUSIVEADDRUSE.
Setting SO_EXCLUSIVEADDRUSE on a socket makes sure that if the binding succeeds, the combination of
source address and port is owned exclusively by this socket and no other socket can bind to them, not
even if it has SO_REUSEADDR set.

Solaris
Solaris is the successor of SunOS. SunOS was originally based on a fork of BSD, SunOS 5 and later was
based on a fork of SVR4, however SVR4 is a merge of BSD, System V, and Xenix, so up to some degree
Solaris is also a BSD fork, and a rather early one. As a result Solaris only knows SO_REUSEADDR, there is
no SO_REUSEPORT. The SO_REUSEADDR behaves pretty much the same as it does in BSD. As far as I
know there is no way to get the same behavior as SO_REUSEPORT in Solaris, that means it is not possible
to bind two sockets to exactly the same address and port.
Similar to Windows, Solaris has an option to give a socket an exclusive binding. This option is
named SO_EXCLBIND. If this option is set on a socket prior to binding it, setting SO_REUSEADDR on another
socket has no effect if the two sockets are tested for an address conflict. E.g. if socketA is bound to a
wildcard address and socketB has SO_REUSEADDR enabled and is bound to a non-wildcard address and
the same port as socketA, this bind will normally succeed, unless socketA had SO_EXCLBIND enabled, in
which case it will fail regardless of the SO_REUSEADDR flag of socketB.

You might also like