Rxsi Blog GameServer Developer

bind源码解析

2022-09-23
Rxsi
TCP

bind 系统调用

bind函数在用户层的函数签名如下:

int bind(int sockfd, const struct sockaddr *myaddr, socklen_t addrlen);
/*
int sockfd:通过socket()函数申请的套接字
const struct sockaddr *myaddr:本地端地址信息结构体,这里使用的是sockaddr_in结构体去转换为sockaddr结构体类型。其中端口号如果绑定为0,则意味着让系统帮我们自动分配一个,一般来说我们会主动分配一个确定的端口号,只有在发起connect时才会让系统替我们自动分配
socklen_t addrlen:结构体的长度,通过sizeof()计算即可

返回值:成功返回0,失败返回-1
*/

对应的系统函数是:

SYSCALL_DEFINE3(bind, int, fd, struct sockaddr __user *, umyaddr, int, addrlen)
{
    return __sys_bind(fd, umyaddr, addrlen);
}

__sys_bind

int __sys_bind(int fd, struct sockaddr __user *umyaddr, int addrlen)
{
    struct socket *sock;
    struct sockaddr_storage address;
    int err, fput_needed;

    sock = sockfd_lookup_light(fd, &err, &fput_needed); // 根据fd找到socket结构体
    if (sock) {
        // .....
            if (!err)
                err = sock->ops->bind(sock,
                              (struct sockaddr *)
                              &address, addrlen); // 调用的是&inet_stream_ops中的inet_bind函数
        }
        fput_light(sock->file, fput_needed);
    }
    return err;
}

inet_bind

int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
    struct sock *sk = sock->sk;
    u32 flags = BIND_WITH_LOCK;
    int err;

    /* If the socket has its own bind function then use it. (RAW) */
    if (sk->sk_prot->bind) { // 因为是TCP,因此这里的sk_prot是&tcp_prot,里面有自定义的bind函数,因此此处会略过
        return sk->sk_prot->bind(sk, uaddr, addr_len);
    }
    if (addr_len < sizeof(struct sockaddr_in)) // 传入的结构体长度不对
        return -EINVAL;

    /* BPF prog is run before any checks are done so that if the prog
     * changes context in a wrong way it will be caught.
     */
    err = BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr,
                         CGROUP_INET4_BIND, &flags);
    if (err)
        return err;

    return __inet_bind(sk, uaddr, addr_len, flags);
}

__inet_bind

int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
        u32 flags)
{
    struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
    struct inet_sock *inet = inet_sk(sk);
    struct net *net = sock_net(sk);
    unsigned short snum;
    int chk_addr_ret;
    u32 tb_id = RT_TABLE_LOCAL;
    int err;

    if (addr->sin_family != AF_INET) { // 检查协议族类型,这里一定是IPV4
        /* Compatibility games : accept AF_UNSPEC (mapped to AF_INET)
         * only if s_addr is INADDR_ANY.
         */
        err = -EAFNOSUPPORT;
        if (addr->sin_family != AF_UNSPEC ||
            addr->sin_addr.s_addr != htonl(INADDR_ANY))
            goto out;
    }

    tb_id = l3mdev_fib_table_by_index(net, sk->sk_bound_dev_if) ? : tb_id; // 不知干嘛的。。
    chk_addr_ret = inet_addr_type_table(net, addr->sin_addr.s_addr, tb_id);

    /* Not specified by any standard per-se, however it breaks too
     * many applications when removed.  It is unfortunate since
     * allowing applications to make a non-local bind solves
     * several problems with systems using dynamic addressing.
     * (ie. your servers still start up even if your ISDN link
     *  is temporarily down)
     */
    err = -EADDRNOTAVAIL;
    if (!inet_addr_valid_or_nonlocal(net, inet, addr->sin_addr.s_addr,
                                     chk_addr_ret)) // 检查绑定的IP地址是否是本地IP(INADDR_ANY、本地回环地址)或者是否设置了sysctl_ip_nonlocal_bind属性
        goto out;

    snum = ntohs(addr->sin_port); // 获取端口号
    err = -EACCES;
    if (!(flags & BIND_NO_CAP_NET_BIND_SERVICE) &&
        snum && inet_port_requires_bind_service(net, snum) &&
        !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE)) // 如果设置了端口号(即不为0)那么检查端口号是否小于1024,因为0~1023号的端口只能由root用户进行绑定,或者使用了CAP_NET_BIND_SERVICE属性才能绑定
        goto out;

    /*      We keep a pair of addresses. rcv_saddr is the one
     *      used by hash lookups, and saddr is used for transmit.
     *
     *      In the BSD API these are the same except where it
     *      would be illegal to use them (multicast/broadcast) in
     *      which case the sending device address is used.
     */
    if (flags & BIND_WITH_LOCK)
        lock_sock(sk);

    /* Check these errors (active socket, double bind). */
    err = -EINVAL;
    if (sk->sk_state != TCP_CLOSE || inet->inet_num) // 重复调用了bind
        goto out_release_sock;

    inet->inet_rcv_saddr = inet->inet_saddr = addr->sin_addr.s_addr;
    if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST)
        inet->inet_saddr = 0;  /* Use device */

    /* Make sure we are allowed to bind here. */
    if (snum || !(inet->bind_address_no_port ||
              (flags & BIND_FORCE_ADDRESS_NO_PORT))) {
        if (sk->sk_prot->get_port(sk, snum)) { // 调用&tcp_prot的inet_csk_get_port函数
            inet->inet_saddr = inet->inet_rcv_saddr = 0;
            err = -EADDRINUSE;
            goto out_release_sock;
        }
        if (!(flags & BIND_FROM_BPF)) {
            err = BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk);
            if (err) {
                inet->inet_saddr = inet->inet_rcv_saddr = 0;
                if (sk->sk_prot->put_port)
                    sk->sk_prot->put_port(sk);
                goto out_release_sock;
            }
        }
    }

    if (inet->inet_rcv_saddr)
        sk->sk_userlocks |= SOCK_BINDADDR_LOCK;
    if (snum)
        sk->sk_userlocks |= SOCK_BINDPORT_LOCK;
    inet->inet_sport = htons(inet->inet_num);
    inet->inet_daddr = 0;
    inet->inet_dport = 0;
    sk_dst_reset(sk);
    err = 0;
out_release_sock:
    if (flags & BIND_WITH_LOCK)
        release_sock(sk);
out:
    return err;
}

inet_csk_get_port

int inet_csk_get_port(struct sock *sk, unsigned short snum) // 这里的snum是端口号,如果是0那么就是由系统自行选择
{
    bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN; // sk->sk_reuse对应的是SO_REUSEADDR项,这里可以看到只有在TCP不处于TCP_LISTEN状态时,该设置才有效
    struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo;
    int ret = 1, port = snum;
    struct inet_bind_hashbucket *head;
    struct net *net = sock_net(sk);
    struct inet_bind_bucket *tb = NULL; // 这里存储的是已经绑定到该端口的所有socket列表
    int l3mdev;

    l3mdev = inet_sk_bound_l3mdev(sk);

    if (!port) { // 如果端口号是0
        head = inet_csk_find_open_port(sk, &tb, &port); // 根据系统配置的端口范围查询可用的端口,系统配置范围由ip_local_port_range决定,默认范围是[32768, 60999)。如果端口已经被某个socket占用,那么会通过inet_csk_bind_conflict函数处理,其中就会应用SO_REUSEADDR和
        if (!head) // 没有查找到可用的端口
            return ret;
        if (!tb)
            goto tb_not_found; // 没有tb结构说明是首次被绑定,那么创建个tb结构先
        goto success;
    }
    // 端口号不为0,那么就要判断该端口号是否已经被人占用了,如果被占用了是否满足SO_REUSEADDR属性而可以顶替掉
    head = &hinfo->bhash[inet_bhashfn(net, port,
                      hinfo->bhash_size)];
    spin_lock_bh(&head->lock);
    inet_bind_bucket_for_each(tb, &head->chain) // 遍历所有的socket看是否已经有某个socket占用了这个端口号
        if (net_eq(ib_net(tb), net) && tb->l3mdev == l3mdev &&
            tb->port == port)
            goto tb_found;
tb_not_found:
    tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
                     net, head, port, l3mdev); // 创建tb结构
    if (!tb)
        goto fail_unlock;
tb_found:
    if (!hlist_empty(&tb->owners)) {
        if (sk->sk_reuse == SK_FORCE_REUSE) // 如果本sock设置了FORCE_REUSE,那么不管另外一个
            goto success;

        if ((tb->fastreuse > 0 && reuse) ||
            sk_reuseport_match(tb, sk))
            goto success;
        if (inet_csk_bind_conflict(sk, tb, true, true))
            goto fail_unlock;
    }
success:
    inet_csk_update_fastreuse(tb, sk);

    if (!inet_csk(sk)->icsk_bind_hash)
        inet_bind_hash(sk, tb, port);
    WARN_ON(inet_csk(sk)->icsk_bind_hash != tb);
    ret = 0;

fail_unlock:
    spin_unlock_bh(&head->lock);
    return ret;
}

系统端口号范围配置如下:

rxsi@VM-20-9-debian:~$ cat /proc/sys/net/ipv4/ip_local_port_range 
32768   60999

inet_csk_bind_conflict

当要绑定的端口号已经被其他 socket 占用了,那么通过此函数判断是否可以顶替,根据属性SO_REUSEADDRSO_REUSEPORT决定

static int inet_csk_bind_conflict(const struct sock *sk,
                  const struct inet_bind_bucket *tb,
                  bool relax, bool reuseport_ok)
{
    struct sock *sk2;
    bool reuseport_cb_ok;
    bool reuse = sk->sk_reuse; // SO_REUSEADDR属性
    bool reuseport = !!sk->sk_reuseport; // SO_REUSEPORT属性
    struct sock_reuseport *reuseport_cb;
    kuid_t uid = sock_i_uid((struct sock *)sk);

    rcu_read_lock();
    reuseport_cb = rcu_dereference(sk->sk_reuseport_cb);
    /* paired with WRITE_ONCE() in __reuseport_(add|detach)_closed_sock */
    reuseport_cb_ok = !reuseport_cb || READ_ONCE(reuseport_cb->num_closed_socks);
    rcu_read_unlock();

    /*
     * Unlike other sk lookup places we do not check
     * for sk_net here, since _all_ the socks listed
     * in tb->owners list belong to the same net - the
     * one this bucket belongs to.
     */

    sk_for_each_bound(sk2, &tb->owners) {
        int bound_dev_if2;

        if (sk == sk2)
            continue;
        bound_dev_if2 = READ_ONCE(sk2->sk_bound_dev_if);
        if ((!sk->sk_bound_dev_if ||
             !bound_dev_if2 ||
             sk->sk_bound_dev_if == bound_dev_if2)) {
            if (reuse && sk2->sk_reuse &&
                sk2->sk_state != TCP_LISTEN) { // linux系统要求两个socket都设置了SO_REUSEADDR属性,且对方的socket不能处于TCP_LISTEN状态
                if ((!relax ||
                     (!reuseport_ok &&
                      reuseport && sk2->sk_reuseport &&
                      reuseport_cb_ok &&
                      (sk2->sk_state == TCP_TIME_WAIT ||
                       uid_eq(uid, sock_i_uid(sk2))))) &&
                    inet_rcv_saddr_equal(sk, sk2, true)) // 都设置了SO_REUSEPORT属性,当设置了该属性
                    break;
            } else if (!reuseport_ok ||
                   !reuseport || !sk2->sk_reuseport ||
                   !reuseport_cb_ok ||
                   (sk2->sk_state != TCP_TIME_WAIT &&
                    !uid_eq(uid, sock_i_uid(sk2)))) {
                if (inet_rcv_saddr_equal(sk, sk2, true))
                    break;
            }
        }
    }
    return sk2 != NULL;
}


Similar Posts

Comments