Linux kernel 2.4.5 IPv4 socket layer: a brief explanation. From: http://www.xici.net/ Author: scancat (2001-07-30 10:00:00)
Disclaimer: my knowledge is limited, so this may contain mistakes; please go easy on me :) 1. Creating a new socket. Function prototype: static int inet_create(struct socket *sock, int protocol), in net/ipv4/af_inet.c. Detailed explanation: static int inet_create(struct socket *sock, int protocol) { struct sock *sk; struct proto *prot; sock->state = SS_UNCONNECTED; /* set the state to unconnected */ sk = sk_alloc(PF_INET, GFP_KERNEL, 1); /* allocate the sock */ /* in net/core/sock.c */ if (sk == NULL) goto do_oom; switch (sock->type) { case SOCK_STREAM: /* TCP protocol */ if (protocol && protocol != IPPROTO_TCP) goto free_and_noproto; protocol = IPPROTO_TCP; prot = &tcp_prot; /* tcp_prot is defined in net/ipv4/tcp_ipv4.c */ sock->ops = &inet_stream_ops; /* socket operations for STREAM sockets */ break; case SOCK_SEQPACKET: /* not supported */ goto free_and_badtype; case SOCK_DGRAM: /* UDP protocol */ if (protocol && protocol != IPPROTO_UDP) goto free_and_noproto; protocol = IPPROTO_UDP; sk->no_check = UDP_CSUM_DEFAULT; prot = &udp_prot; /* udp_prot is defined in net/ipv4/udp.c */ sock->ops = &inet_dgram_ops; /* socket operations for DGRAM sockets */ break; case SOCK_RAW: /* RAW */ if (!capable(CAP_NET_RAW)) /* check whether the caller has permission to create a SOCK_RAW socket */ goto free_and_badperm; if (!protocol) /* protocol cannot be 0 */ goto free_and_noproto; prot = &raw_prot; /* raw_prot is defined in net/ipv4/raw.c */ sk->reuse = 1; /* allow address reuse */ sk->num = protocol; sock->ops = &inet_dgram_ops; /* RAW shares some operations with DGRAM */ if (protocol == IPPROTO_RAW) sk->protinfo.af_inet.hdrincl = 1; /* allow the user to supply a custom IP header */ break; default: goto free_and_badtype; } if (ipv4_config.no_pmtu_disc) sk->protinfo.af_inet.pmtudisc = IP_PMTUDISC_DONT; else sk->protinfo.af_inet.pmtudisc =
IP_PMTUDISC_WANT; SK-> Protinfo.af_Inet.ID = 0; SOCK_INIT_DATA (SOCK, SK); / * Initialization Some Data * / / * Net / Core / Sock.c * / SK-> Destruct = INET_SOCK_DESTRUCT; / * When destroyed Socket Time to call inet_sock_destruct * / SK-> zapped = 0; SK-> Family = PF_INET; SK-> Prot = Prot; SK-> Prot = Prot; SK-> Backlog_rcv = Prot-> Backlog_rcv; / * prot-> backlog_rcv ( See the definition of each type * / SK-> protinfo.af_ININET.TTL = SYSCTL_IP_DEFAULT_TTL; / * Set the default TTL * / / * Modification / Proc / Sys / Net / IPv4 / IP_DEFAULT_TTL * / SK-> Protinfo.af_INET.MC_LOOP = 1; SK-> Protinfo.af_INET.MC_TTL = 1; SK-> Protinfo.af_Inet.mc_index = 0; SK-> Protinfo.af_Inet.mc_list = null; #ifdef = NULL; #IFDEF INET_REFCNT_DEBUG Atomic_inc (& INET_SOCK_NR); #ENDIF IF (SK-> Num) {/ * it assumes That Any Protocol Which Allows * The user to assign a number at solidket * creation time automatically * shares. * / SK-> Sport = HTONS (SK-> Num); / * Setting up local port * / / * Add to protocol hash chains. * / SK-> Prot-> hash (s);} if (SK-> prot-> init) {int err = SK-> prot-> init (sk); / * protocol Initialization to Socket * / if (Err! = 0) { inet_sock_release (sk); return (err);}} return (0); free_and_badtype: sk_free (sk); / * release memory * / return -ESOCKTNOSUPPORT; free_and_badperm: sk_free (sk); return -EPERM; free_and_noproto: sk_free (sk ); return -EPROTONOSUPPORT; do_oom: return -ENOBUFS;} in the net / core / sock.c void sock_init_data (struct socket * sock, struct sock * sk) {skb_queue_head_init (& sk-> receive_queue); / * initialize queue accepts 3 , Send, error * / SKB_QUE_HEAD_INIT (& SK-> Write_Queue); SKB_QUEUE_HEAD_INIT (& SK-> Error_Queue); init_timer (& SK-> Timer; / * Initialization Timer * / SK-> allocation = gfp_kernel; SK->
RCVBUF = SYSCTL_RMEM_DEFAULT; SK-> SNDBUF = SYSCTL_WMEM_DEFAULT; SK-> State = TCP_Close; SK-> Zapped = 1; SK-> Socket = Sock; if (Sock) {SK-> Type = SOCK-> Type; SK-> sleep = & sock-> wait; sock-> sk = sk;} else sk-> sleep = NULL; sk-> dst_lock = RW_LOCK_UNLOCKED; sk-> callback_lock = RW_LOCK_UNLOCKED; / * sock_def_wakeup (), sock_def_readable (), sock_def_write_space () , sock_def_error_report (), sock_def_destruct () in net / core / sock.c * / sk-> state_change = sock_def_wakeup; sk-> data_ready = sock_def_readable; sk-> write_space = sock_def_write_space; sk-> error_report = sock_def_error_report; sk-> destruct = SOCK_DEF_DESTRUCT; SK-> Peercred.PID = 0; SK-> Peercred.uid = -1; SK-> Peercred.gid = -1; SK-> RCVLOWAT = 1; SK-> RCVTIMEO = max_schedule_timeout; / * Settings Accept , transmission timeout * / sk-> sndtimeo = MAX_SCHEDULE_TIMEOUT; atomic_set (& sk-> refcnt, 1);} 1.1 SOCK_STREAM initialized at net / ipv4 / tcp_ipv4.c static int tcp_v4_init_sock (struct sock * sk) {struct tcp_opt * tp = & (SK-> TP_PINFO.AF_TCP); skb_queue_head_init (& tp-> out_of_order_queue); tcp_init_xmit_timers (sk); tcp_prequeue_init (tp); tp-> rto = TCP_TIMEOUT_INIT; tp-> mdev = TCP_TIMEOUT_INIT; / * So many TCP implementations out there (incorrectly) count the * initial SYN frame in their delayed-ACK and congestion control * algorithms that we must have the following bandaid to talk * efficiently to them -DaveM * / tp-> snd_cwnd = 2;. / * See draft-stevens-tcpca-spec-01 for discussion of the * Initialization of these Values. * / TP-> SND_SSTHRESH =
0x7fffffff; / * Infinity * / tp-> snd_cwnd_clamp = ~ 0; tp-> mss_cache = 536; tp-> reordering = sysctl_tcp_reordering; sk-> state = TCP_CLOSE; sk-> write_space = tcp_write_space; / * tcp_write_space () in the net /ipv4/tcp.c * / sk-> use_write_queue = 1; sk-> tp_pinfo.af_tcp.af_specific = & ipv4_specific; / * ipv4_specific in net / ipv4 / tcp_ipv4.c * / sk-> sndbuf = sysctl_tcp_wmem [1]; / * Set the transmission and reception buffer size * / SK-> RCVBUF = SYSCTL_TCP_RMEM [1]; / * sysctl_tcp_ * in Net / IPv4 / TCP.c * / atomic_inc_inc (& TCP_SOCKETS_ALLOCATED); / * TCP_SOCKETS_ALLOCATED is the current TCP Socket * / Return 0;} SOCK_DGRAM No 1.2 Sock_RAW Initialization In Net / IPv4 / Raw.c Static Int Raw_init (Struct Sock * SK) {Struct Raw_opt * TP = & (SK-> TP_PINFO.TP_RAW4); if (SK-> Num = = IPPROTO_ICMP) memset (& tp-> filter, 0, sizeof (tp-> filter)); return 0;} 2.Server 2.1 bind static int inet_bind (struct socket * sock, struct sockaddr * uaddr, int addr_len) {struct sockaddr_in * addr = (struct socmeddr_in *) uaddr; struct soc check * SK = SOCK-> SK; UNSIGNED SHORT SNUM; INT CHK_ADDR_RET; in T Err; / * if The socket Has Its OWN BIND FUNCTION THEN USE. (RAW) * / IF (SK-> Prot-> Bind) Return SK-> Prot-> Bind (SK, Uaddr, Addr_len); / * Only SOCK_RAW defines its own bind function * / if (addr_len
/* inet_addr_type() returns the type of the address */ /* in net/ipv4/fib_frontend.c */ /* Not specified by any standard per-se, however it breaks too * many applications when removed. It is unfortunate since * allowing applications to make a non-local bind solves * several problems with systems using dynamic addressing. * (ie. your servers still start up even if your ISDN link * is temporarily down) */ if (sysctl_ip_nonlocal_bind == 0 && sk->protinfo.af_inet.freebind == 0 && addr->sin_addr.s_addr != INADDR_ANY && chk_addr_ret != RTN_LOCAL && chk_addr_ret != RTN_MULTICAST && chk_addr_ret != RTN_BROADCAST) return -EADDRNOTAVAIL; snum = ntohs(addr->sin_port); if (snum && snum
} IF (SK-> RCV_SADDR) SK-> UserLocks | = SOCK_BINDADDR_LOCK; IF (SNUM) SK-> UserLocks | = SOCK_BINDPORT_LOCK; SK-> Sport = Htons (SK-> Num); SK-> DADDR = 0; SK- > DPORT = 0; SK_DST_RESET (SK); Err = 0; OUT: Release_Sock (SK); Return Err;} Sock_Stream and Sock_DGRAM with default bind 2.1.1 sock_raw bind in Net / IPv4 / Raw.c static int Raw_bind struct sock * sk, struct sockaddr * uaddr, int addr_len) {struct sockaddr_in * addr = (struct sockaddr_in *) uaddr; int ret = -EINVAL; int chk_addr_ret;! if (sk-> state = TCP_CLOSE || addr_len / * Really, if The Socket IS Already in Listen State * We can online allow the backlog to be adjusted. * / IF (Old_State! = TCP_Listen) { Err = TCP_Listen_Start (SK); / * Really implemented TCP protocol Listen * / IF (ERR) Goto Out; } SK-> MAX_ACK_BACKLOG = backlog; Err = 0; OUT: Release_sock (sk); Return ERR; } TCP_LISTEN_START in Net / IPv4 / TCP.H INT TCP_LISTEN_START (STRUCT SOCK * SK) { Struct TCP_OPT * TP = & (SK-> TP_PINFO.AF_TCP); Struct tcp_listen_opt * lopt; SK-> MAX_ACK_BACKLOG = 0; SK-> ACK_BACKLOG = 0; TP-> accept_queue = tp-> accept_queue_tail = null; TP-> SYN_WAIT_LOCK = RW_LOCK_UNLOCKED; TCP_DELACK_INIT (TP); / * TP Qing 0 * / / * Incrude / net / tcp.h * / LOPT = kmalloc (SIZEOF (struct tcp_listen_opt), gfp_kernel); IF (! LOPT) Return -ENMEM; MEMSET (LOPT, 0, SIZEOF (Struct TCP_Listen_opt); For (LOPT-> max_qlen_log = 6;; lopt-> max_qlen_log ) IF ((1 < MAX_QLEN_LOG)> = sysctl_max_syn_backlog) Break; Write_lock_bh (& TP-> SYN_WAIT_LOCK); TP-> listen_opt = LOPT; Write_unlock_bh (& TP-> SYN_WAIT_LOCK); / * There is race window here: we announce Ourslves Listening, * But this transition is still not validated by get_port (). * IT IS OK, Because this socket Enters to Hash Table ONLY * After Validation is Complete. 
* / SK-> State = TCP_Listen; IF (SK-> Prot-> get_port (SK, SK-> NUM) == 0) {/ * confirmation address is not reused * / SK-> Sport = HTONS (SK-> NUM); / * Set the source port * / SK_DST_RESET (SK); SK-> Prot-> Hash (SK); / * Add port to the Hash table * / Return 0; } SK-> State = TCP_Close; Write_lock_bh (& TP-> SYN_WAIT_LOCK); TP-> Listen_opt = NULL; Write_unlock_bh (& TP-> SYN_WAIT_LOCK); Kfree (LOPT); Return-eaddrinuse; } SOCK_DGRAM and SOCK_RAW do not support Listen 2.3 ACCEPT 2.3.1 SOCK_STREAM Accept In Net / IPv4 / AF_INET.C INT INET_ACCEPT (Struct Socket * Sock, Struct Socket * Newsock, Int Flags) { Struct Sock * SK1 = SOCK-> SK; Struct Sock * SK2; Int err = -einval; IF ((SK2 = SK1-> Prot-> Accept (SK1, FLAGS, & ERR) == NULL) Goto do_err; LOCK_SOCK (SK2); Bug_trap ((1 < State) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_CLOSE)); SOCK_GRAFT (SK2, Newsock); / * Connected SK2 to Newsock * / / * In include / net / sock.h * / Newsock-> state = ss_connected; Release_sock (SK2); Return 0; Do_err: Return ERR; } SOCK_DGRAM and SOCK_RAW do not support Accept 2.3.1.1 ACCEPT for TCP Protocol In Net / IPv4 / TCP.c Struct Sock * TCP_ACCEPT (Struct Sock * SK, INT FLAGS, INT * ERR) { Struct TCP_OPT * TP = & SK-> TP_PINFO.AF_TCP; Struct Open_Request * Req; Struct Sock * newsk; Int error; LOCK_SOCK (SK); / * We need to make sure what this socket is listening, * AND That it has something pending. * / Error = - EINVAL; IF (SK-> State! = TCP_LISTEN) / * Check if the socket is in the Listen status * / Goto Out; / * Find Already Established Connection * / if (! tp-> accept_queue) {/ * Judging whether the Accept queue is ready * / Long Timeo = SOCK_RCVTIMEO (SK, Flags & O_nonblock); / * Judgment is a plug mode * / / * In include / net / sock.h * / / * If this is a non blocking socket don't sleep * / Error = -eagain; If (! 
timeo) / * does not block the mode, return directly * / Goto Out; Error = Wait_for_connect (SK, Timeo); / * Enter idle waiting connection * / IF (Error) Goto Out; } Req = TP-> Accept_queue; IF ((tp-> accept_queue = req-> dl_next) == NULL) TP-> accept_queue_tail = null; Newsk = req-> SK; TCP_ACCEPTQ_REMOVED (SK); / * SK Current connection 1 * / / * In include / net / tcp.h * / TCP_OPENREQ_FASTFREE (Req); / * Release memory * / / * In include / net / tcp.h * / Bug_trap (newsk-> state! = Tcp_syn_recv); Release_sock (sk); Return Newsk; Out: Release_sock (sk); * err = error; Return NULL; } / * Only when socket is a plug mode, the function will be called * / / * In Net / IPv4 / TCP.c * / Static int WAIT_FOR_CONNECT (Struct Sock * SK, Long Timeo) { Declare_WaitQueue (Wait, CURRENT); Int Err; / * * True Wake-One Mechanism for Incoming Connections: ONLY * ONE Process Gets Woken Up, NOT The 'Whole Herd'. * Since We do not 'Race & Poll' for Established Sockets * Anymore, The Common Case Will Execute The loop ONLY ONCE. * * SUBTLE ISSUE: "Add_wait_queue_exclusive ()" Will Be Added *After any current Non-Exclusive Waiters, And We know That * IT WILL Always _Stay_After Any New Non-Exclusive Waiters * Because All Non-Exclusive Waiters Areadded At The * Beginning of the Wait-Queue. As Such, It's Ok to "DROP" * Our Exclusiveness Temporarily When We get Woken Up With WITHOUT * Having to remove and re-insert US on The Wait Queue. * / Add_wait_queue_exclusive (SK-> Sleep, & Wait); For (;;) { Current-> State = Task_Interruptible; Release_sock (sk); IF (SK-> TP_PINFO.AF_TCP.ACCEPT_QUEUE == NULL) Timeo = Schedule_timeout; / * Sleep Timeo Duration * / LOCK_SOCK (SK); Err = 0; IF (SK-> TP_PINFO.AF_TCP.ACCEPT_QUEUE) / * Accept Queue available * / / * That is, there is a connection entry * / Break; Err =-EinVal; IF (SK-> State! = TCP_LISTEN) Break; Err = SOCK_INTR_ERRNO (TIMEO); IF (Signal_Pending (CURRENT)) Break; Err = -eagain; IF (! 
Timeo) Break; } Current-> State = Task_Running; REMOVE_WAIT_QUEUE (SK-> Sleep, & Wait); Return ERR; } 3.client 3.1 Connect 3.1.1 Sock_Stream CONNECT In Net / IPv4 / AF_INET.C INT INET_STREAM_CONNECT (Struct Socket * Sock, Struct SockAddr * Uaddr, Int Addr_len, int FLAGS { Struct Sock * SK = SOCK-> SK; int ERR; Long Timeo; LOCK_SOCK (SK); IF (uaddr-> sa_family == AF_UNSPEC) { Err = SK-> Prot-> Disconnect (SK, FLAGS); / * Close connection * / SOCK-> State = Err? ss_disconnecting: ss_unconnected; Goto Out; } Switch (SOCK-> State) { DEFAULT: Err =-EinVal; Goto Out; Case SS_CONNECTED: Err = -eisconn; Goto Out; Case SS_CONNECTING: Err = -EalReady; / * Fall Out of Switch with Err, Set for this State * / Break; Case SS_UNCONNECTED: Err = -eisconn; IF (SK-> State! = TCP_CLOSE) Goto Out; Err = -eagain; IF (SK-> NUM == 0) { IF (SK-> Prot-> get_port (SK, 0)! = 0) / * Whether to reuse * / Goto Out; SK-> Sport = HTONS (SK-> NUM); } Err = SK-> Prot-> Connect (SK, UADDR, AddR_len); / * CONNECT * / IF (Err <0) Goto Out; Sock-> state = ss_connecting; / * socket status is set to connect * / / * Just Entered SS_CONNECTING STATE; THE ONLY * Difference is that Return Value in non-blocking * Case is EinProgress, Rather Than Ealready. * / Err = - EinProgress; Break; } Timeo = SOCK_SNDTIMEO (SK, FLAGS & O_NONBLOCK); / * Is it a plug mode * / / * In include / net / sock.h * / IF ((1 < State) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {/ * connection is completed * / / * Error Code is set above * / IF (! Timeo ||! inet_wait_for_connect (sk, timeo)) / * Non-blocking mode returns now * / / * Clip mode call INET_WAIT_FOR_CONNECT () * / Goto Out; Err = SOCK_INTR_ERRNO (TIMEO); IF (Signal_Pending (CURRENT)) Goto Out; } / * Connection Was Closed By Rst, Timeout, ICMP Error * OR Another Process Disconnected US. * / IF (SK-> State == TCP_CLOSE) Goto Sock_ERROR; / * SK-> Err May Be Not Zero Now, if Recverr Was Ordered by User * And Error Was Received After Socket Entered Established State. 
* Hence, it is handled normally after connect() returns successfully. */ sock->state = SS_CONNECTED; /* set the state to connected */ err = 0; out: release_sock(sk); return err; sock_error: err = sock_error(sk) ? : -ECONNABORTED; sock->state = SS_UNCONNECTED; if (sk->prot->disconnect(sk, flags)) sock->state = SS_DISCONNECTING; goto out; } /* This function is called only when the socket is in blocking mode */ /* in net/ipv4/af_inet.c */ static long inet_wait_for_connect(struct sock *sk, long timeo) { DECLARE_WAITQUEUE(wait, current); __set_current_state(TASK_INTERRUPTIBLE); add_wait_queue(sk->sleep, &wait); /* Basic assumption: if someone sets sk->err, he _must_ * change state of the socket from TCP_SYN_*. * Connect() does not allow to get error notifications * without closing the socket. */ while ((1 << sk->state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { release_sock(sk); timeo = schedule_timeout(timeo); /* go to sleep */ lock_sock(sk); if (signal_pending(current) || !timeo) break; set_current_state(TASK_INTERRUPTIBLE); } __set_current_state(TASK_RUNNING); remove_wait_queue(sk->sleep, &wait); return timeo; } (http://www.fanqiang.com)