bind 系统调用
bind
函数在用户层的函数签名如下:
int bind(int sockfd, const struct sockaddr *myaddr, socklen_t addrlen);
/*
int sockfd:通过socket()函数申请的套接字
const struct sockaddr *myaddr:本地端地址信息结构体,这里使用的是sockaddr_in结构体去转换为sockaddr结构体类型。其中端口号如果绑定为0,则意味着让系统帮我们自动分配一个,一般来说我们会主动分配一个确定的端口号,只有在发起connect时才会让系统替我们自动分配
socklen_t addrlen:结构体的长度,通过sizeof()计算即可
返回值:成功返回0,失败返回-1
*/
对应的系统函数是:
SYSCALL_DEFINE3(bind, int, fd, struct sockaddr __user *, umyaddr, int, addrlen)
{
return __sys_bind(fd, umyaddr, addrlen);
}
__sys_bind
int __sys_bind(int fd, struct sockaddr __user *umyaddr, int addrlen)
{
struct socket *sock;
struct sockaddr_storage address;
int err, fput_needed;
sock = sockfd_lookup_light(fd, &err, &fput_needed); // 根据fd找到socket结构体
if (sock) {
// .....
if (!err)
err = sock->ops->bind(sock,
(struct sockaddr *)
&address, addrlen); // 调用的是&inet_stream_ops中的inet_bind函数
}
fput_light(sock->file, fput_needed);
}
return err;
}
inet_bind
int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
struct sock *sk = sock->sk;
u32 flags = BIND_WITH_LOCK;
int err;
/* If the socket has its own bind function then use it. (RAW) */
if (sk->sk_prot->bind) { // 因为是TCP,因此这里的sk_prot是&tcp_prot,里面有自定义的bind函数,因此此处会略过
return sk->sk_prot->bind(sk, uaddr, addr_len);
}
if (addr_len < sizeof(struct sockaddr_in)) // 传入的结构体长度不对
return -EINVAL;
/* BPF prog is run before any checks are done so that if the prog
* changes context in a wrong way it will be caught.
*/
err = BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr,
CGROUP_INET4_BIND, &flags);
if (err)
return err;
return __inet_bind(sk, uaddr, addr_len, flags);
}
__inet_bind
int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
u32 flags)
{
struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
struct inet_sock *inet = inet_sk(sk);
struct net *net = sock_net(sk);
unsigned short snum;
int chk_addr_ret;
u32 tb_id = RT_TABLE_LOCAL;
int err;
if (addr->sin_family != AF_INET) { // 检查协议族类型,这里一定是IPV4
/* Compatibility games : accept AF_UNSPEC (mapped to AF_INET)
* only if s_addr is INADDR_ANY.
*/
err = -EAFNOSUPPORT;
if (addr->sin_family != AF_UNSPEC ||
addr->sin_addr.s_addr != htonl(INADDR_ANY))
goto out;
}
tb_id = l3mdev_fib_table_by_index(net, sk->sk_bound_dev_if) ? : tb_id; // 不知干嘛的。。
chk_addr_ret = inet_addr_type_table(net, addr->sin_addr.s_addr, tb_id);
/* Not specified by any standard per-se, however it breaks too
* many applications when removed. It is unfortunate since
* allowing applications to make a non-local bind solves
* several problems with systems using dynamic addressing.
* (ie. your servers still start up even if your ISDN link
* is temporarily down)
*/
err = -EADDRNOTAVAIL;
if (!inet_addr_valid_or_nonlocal(net, inet, addr->sin_addr.s_addr,
chk_addr_ret)) // 检查绑定的IP地址是否是本地IP(INADDR_ANY、本地回环地址)或者是否设置了sysctl_ip_nonlocal_bind属性
goto out;
snum = ntohs(addr->sin_port); // 获取端口号
err = -EACCES;
if (!(flags & BIND_NO_CAP_NET_BIND_SERVICE) &&
snum && inet_port_requires_bind_service(net, snum) &&
!ns_capable(net->user_ns, CAP_NET_BIND_SERVICE)) // 如果设置了端口号(即不为0)那么检查端口号是否小于1024,因为0~1023号的端口只能由root用户进行绑定,或者使用了CAP_NET_BIND_SERVICE属性才能绑定
goto out;
/* We keep a pair of addresses. rcv_saddr is the one
* used by hash lookups, and saddr is used for transmit.
*
* In the BSD API these are the same except where it
* would be illegal to use them (multicast/broadcast) in
* which case the sending device address is used.
*/
if (flags & BIND_WITH_LOCK)
lock_sock(sk);
/* Check these errors (active socket, double bind). */
err = -EINVAL;
if (sk->sk_state != TCP_CLOSE || inet->inet_num) // 重复调用了bind
goto out_release_sock;
inet->inet_rcv_saddr = inet->inet_saddr = addr->sin_addr.s_addr;
if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST)
inet->inet_saddr = 0; /* Use device */
/* Make sure we are allowed to bind here. */
if (snum || !(inet->bind_address_no_port ||
(flags & BIND_FORCE_ADDRESS_NO_PORT))) {
if (sk->sk_prot->get_port(sk, snum)) { // 调用&tcp_prot的inet_csk_get_port函数
inet->inet_saddr = inet->inet_rcv_saddr = 0;
err = -EADDRINUSE;
goto out_release_sock;
}
if (!(flags & BIND_FROM_BPF)) {
err = BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk);
if (err) {
inet->inet_saddr = inet->inet_rcv_saddr = 0;
if (sk->sk_prot->put_port)
sk->sk_prot->put_port(sk);
goto out_release_sock;
}
}
}
if (inet->inet_rcv_saddr)
sk->sk_userlocks |= SOCK_BINDADDR_LOCK;
if (snum)
sk->sk_userlocks |= SOCK_BINDPORT_LOCK;
inet->inet_sport = htons(inet->inet_num);
inet->inet_daddr = 0;
inet->inet_dport = 0;
sk_dst_reset(sk);
err = 0;
out_release_sock:
if (flags & BIND_WITH_LOCK)
release_sock(sk);
out:
return err;
}
inet_csk_get_port
int inet_csk_get_port(struct sock *sk, unsigned short snum) // 这里的snum是端口号,如果是0那么就是由系统自行选择
{
bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN; // sk->sk_reuse对应的是SO_REUSEADDR项,这里可以看到只有在TCP不处于TCP_LISTEN状态时,该设置才有效
struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo;
int ret = 1, port = snum;
struct inet_bind_hashbucket *head;
struct net *net = sock_net(sk);
struct inet_bind_bucket *tb = NULL; // 这里存储的是已经绑定到该端口的所有socket列表
int l3mdev;
l3mdev = inet_sk_bound_l3mdev(sk);
if (!port) { // 如果端口号是0
head = inet_csk_find_open_port(sk, &tb, &port); // 根据系统配置的端口范围查询可用的端口,系统配置范围由ip_local_port_range决定,默认范围是[32768, 60999)。如果端口已经被某个socket占用,那么会通过inet_csk_bind_conflict函数处理,其中就会应用SO_REUSEADDR和
if (!head) // 没有查找到可用的端口
return ret;
if (!tb)
goto tb_not_found; // 没有tb结构说明是首次被绑定,那么创建个tb结构先
goto success;
}
// 端口号不为0,那么就要判断该端口号是否已经被人占用了,如果被占用了是否满足SO_REUSEADDR属性而可以顶替掉
head = &hinfo->bhash[inet_bhashfn(net, port,
hinfo->bhash_size)];
spin_lock_bh(&head->lock);
inet_bind_bucket_for_each(tb, &head->chain) // 遍历所有的socket看是否已经有某个socket占用了这个端口号
if (net_eq(ib_net(tb), net) && tb->l3mdev == l3mdev &&
tb->port == port)
goto tb_found;
tb_not_found:
tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
net, head, port, l3mdev); // 创建tb结构
if (!tb)
goto fail_unlock;
tb_found:
if (!hlist_empty(&tb->owners)) {
if (sk->sk_reuse == SK_FORCE_REUSE) // 如果本sock设置了FORCE_REUSE,那么不管另外一个
goto success;
if ((tb->fastreuse > 0 && reuse) ||
sk_reuseport_match(tb, sk))
goto success;
if (inet_csk_bind_conflict(sk, tb, true, true))
goto fail_unlock;
}
success:
inet_csk_update_fastreuse(tb, sk);
if (!inet_csk(sk)->icsk_bind_hash)
inet_bind_hash(sk, tb, port);
WARN_ON(inet_csk(sk)->icsk_bind_hash != tb);
ret = 0;
fail_unlock:
spin_unlock_bh(&head->lock);
return ret;
}
系统端口号范围配置如下:
rxsi@VM-20-9-debian:~$ cat /proc/sys/net/ipv4/ip_local_port_range
32768 60999
inet_csk_bind_conflict
当要绑定的端口号已经被其他 socket 占用了,那么通过此函数判断是否可以顶替,根据属性SO_REUSEADDR
和SO_REUSEPORT
决定
static int inet_csk_bind_conflict(const struct sock *sk,
const struct inet_bind_bucket *tb,
bool relax, bool reuseport_ok)
{
struct sock *sk2;
bool reuseport_cb_ok;
bool reuse = sk->sk_reuse; // SO_REUSEADDR属性
bool reuseport = !!sk->sk_reuseport; // SO_REUSEPORT属性
struct sock_reuseport *reuseport_cb;
kuid_t uid = sock_i_uid((struct sock *)sk);
rcu_read_lock();
reuseport_cb = rcu_dereference(sk->sk_reuseport_cb);
/* paired with WRITE_ONCE() in __reuseport_(add|detach)_closed_sock */
reuseport_cb_ok = !reuseport_cb || READ_ONCE(reuseport_cb->num_closed_socks);
rcu_read_unlock();
/*
* Unlike other sk lookup places we do not check
* for sk_net here, since _all_ the socks listed
* in tb->owners list belong to the same net - the
* one this bucket belongs to.
*/
sk_for_each_bound(sk2, &tb->owners) {
int bound_dev_if2;
if (sk == sk2)
continue;
bound_dev_if2 = READ_ONCE(sk2->sk_bound_dev_if);
if ((!sk->sk_bound_dev_if ||
!bound_dev_if2 ||
sk->sk_bound_dev_if == bound_dev_if2)) {
if (reuse && sk2->sk_reuse &&
sk2->sk_state != TCP_LISTEN) { // linux系统要求两个socket都设置了SO_REUSEADDR属性,且对方的socket不能处于TCP_LISTEN状态
if ((!relax ||
(!reuseport_ok &&
reuseport && sk2->sk_reuseport &&
reuseport_cb_ok &&
(sk2->sk_state == TCP_TIME_WAIT ||
uid_eq(uid, sock_i_uid(sk2))))) &&
inet_rcv_saddr_equal(sk, sk2, true)) // 都设置了SO_REUSEPORT属性,当设置了该属性
break;
} else if (!reuseport_ok ||
!reuseport || !sk2->sk_reuseport ||
!reuseport_cb_ok ||
(sk2->sk_state != TCP_TIME_WAIT &&
!uid_eq(uid, sock_i_uid(sk2)))) {
if (inet_rcv_saddr_equal(sk, sk2, true))
break;
}
}
}
return sk2 != NULL;
}