Friday, November 30, 2007

Linux的硬件地址解析过程 - collide的专栏

Linux的硬件地址解析过程

1) 在网络接口设备的硬件层之间能够直接进行包交换的设备构成了一个局域网, 局域网中的每一设备具有唯一的硬件地址.
对TCP/IP协议来说, 局域网中的每一设备又具有唯一的IP地址.
当IP包要从某一设备发向局域网中具有另一IP地址的设备时, 信源设备必须获得信宿设备的硬件地址, 这就需要硬件地址解析.
arp协议是根据设备的IP地址获取其硬件地址的方法.
信源设备向局域网广播自己的地址解析请求, 局域网中其余设备都收到该请求, 具有所请求IP地址的设备向信源设备作出应答, 提供它的硬件地址.
由于arp请求的广播特性, 某一设备可以对不是自己IP地址的请求作出应答, 这就是arp代理.

2) 在Linux内核中, 将能与自己在硬件层直接通信的外部主机的网络接口设备称为"邻居", 用neighbour结构描述, 它包含设备的硬件地址信息.
系统中所有的IP包都通过路由所绑定的邻居发送到接口设备上.
邻居由邻居表(arp_tbl)来索引, 用邻居的IP地址可查询邻居表中某个设备的邻居.

3) 当邻居接收到要发送的IP包时, 如果邻居的硬件地址还未解析,
则将发送包暂时缓冲在arp_queue队
列中,然后发送地址解析请求, 这时的状态为未完成状态(NUD_INCOMPLETE).
如果1秒内没收到外部设备
应答, 邻居将重发arp请求, 如果重发达到3次, 则解析失败, 邻居为失败状态(NUD_FAILED).
当收到正
确应答, 邻居进入连接状态(NUD_REACHABLE),
这时arp_queue中发送包将被创建帧头后发送到设备上.

4) 邻居的IP地址与硬件地址的关系并不是固定的, 系统在接收来自邻居的IP包时, 会及时地证实(confirm)邻居的IP地址与硬件地址的映射关系.
同时, 邻居表周期性地扫描邻居(neigh_periodic_timer), 扫描间隔为gc_interval(arp_tbl中设为30秒; 首次扫描在gc_interval加reachable_time之后),
一方面从表中清除那些解析失败和长时间(gc_staletime, 60秒)未被使用的邻居, 另一方面识别那些证实时间已超时的邻居, 将它们从连接状态变为过期状态(NUD_STALE).
当邻居在过期状态发送IP包时, 就进入延迟状态(NUD_DELAY), 如果在延迟状态5秒后邻居的硬件地址还是未被证实, 邻居则发送arp请求, 这时进入探测状态(NUD_PROBE).
在探测状态, IP包并不进行缓冲, 仍旧使用过期的邻居地址发送, 如果探测失败, 邻居进入失败状态.

5) 为了缩短IP包到设备的传输路径, 在邻居结构上还引入了帧头缓冲结构(hh_cache).
如果邻居建立了
帧头缓冲, IP包将通过帧头缓冲的输出发送出去. 当邻居处于连接状态时, 帧头缓冲输出直接指向
dev_queue_xmit(), 当处于过期状态时, 帧头缓冲输出切换为邻居的输出口, 对以太网设备来说,
邻居的
输出口指向neigh_resolve_output(),
neigh_connect()和neigh_suspect()两个函数用来进行这种切换.

6) 当系统应答外部设备的arp请求时, 系统将在arp_tbl中创建该外部设备的邻居, 并刷新为过期状态.
当收到对其它设备的地址解析请求时, 系统将源设备的邻居刷新为过期状态.
当收到单发给自己, 但目的IP地址不是自己主机地址的arp请求时, 如果设备允许转发并且该IP在代理表有定义,
则将它们缓冲到proxy_queue队列, 等待一段随机的时间作出应答, 防止目标设备拥塞, 向对方提供的是自己的设备地址.

这一部分有点复杂, 还有很多细节尚未搞清, 只能写这么多, 欢迎大家指点.
现在自己的问题是: 在什么情况下必须使用arp代理?

; net/ipv4/ip_output.c:

static inline int ip_finish_output2(struct sk_buff *skb) IP包的发送出口
{
struct dst_entry *dst = skb->dst; 取IP包的路由结构
struct hh_cache *hh = dst->hh; 取路由的帧头缓冲

#ifdef CONFIG_NETFILTER_DEBUG
nf_debug_ip_finish_output2(skb);
#endif /*CONFIG_NETFILTER_DEBUG*/

if (hh) { 如果路由帧头缓冲非空
read_lock_bh(&hh->hh_lock);
memcpy(skb->data - 16, hh->hh_data, 16); 创建IP包的硬件帧头
read_unlock_bh(&hh->hh_lock);
skb_push(skb, hh->hh_len);
return hh->hh_output(skb); 通过帧头缓冲发出
} else if (dst->neighbour)
return dst->neighbour->output(skb); 通过邻居出口发出

printk(KERN_DEBUG "khm\n");
kfree_skb(skb);
return -EINVAL;
}

; net/ipv4/arp.c, core/neighbour.c:

/* Neighbour Unreachability Detection (NUD) states, one bit each. */
#define NUD_INCOMPLETE 0x01
#define NUD_REACHABLE 0x02
#define NUD_STALE 0x04
#define NUD_DELAY 0x08
#define NUD_PROBE 0x10
#define NUD_FAILED 0x20

/* Dummy states */
#define NUD_NOARP 0x40
#define NUD_PERMANENT 0x80
#define NUD_NONE 0x00

/* NUD_NOARP & NUD_PERMANENT are pseudostates, they never change
   and make no address resolution or NUD.
   NUD_PERMANENT also cannot be deleted by garbage collectors.
 */
/* States in which the per-neighbour timer is running. */
#define NUD_IN_TIMER (NUD_INCOMPLETE|NUD_DELAY|NUD_PROBE)
/* States in which the neighbour holds a usable hardware address. */
#define NUD_VALID (NUD_PERMANENT|NUD_NOARP|NUD_REACHABLE|NUD_PROBE|NUD_STALE|NUD_DELAY)
/* States in which the hardware address needs no confirmation. */
#define NUD_CONNECTED (NUD_PERMANENT|NUD_NOARP|NUD_REACHABLE)

/* Bucket-index masks for the neighbour and proxy-neighbour hash tables. */
#define NEIGH_HASHMASK 0x1F
#define PNEIGH_HASHMASK 0xF

struct neigh_table 网络邻居表
{
struct neigh_table *next;
int family;
int entry_size;
int key_len;
__u32 (*hash)(const void *pkey, const struct net_device *);
int (*constructor)(struct neighbour *);
int (*pconstructor)(struct pneigh_entry *);
void (*pdestructor)(struct pneigh_entry *);
void (*proxy_redo)(struct sk_buff *skb);
char *id;
struct neigh_parms parms;
/* HACK. gc_* shoul follow parms without a gap! */
int gc_interval; (60秒)
int gc_thresh1; (128)
int gc_thresh2; (512)
int gc_thresh3; (1024)
unsigned long last_flush;
struct timer_list gc_timer;
struct timer_list proxy_timer;
struct sk_buff_head proxy_queue;
int entries;
rwlock_t lock;
unsigned long last_rand;
struct neigh_parms *parms_list;
kmem_cache_t *kmem_cachep;
struct tasklet_struct gc_task;
struct neigh_statistics stats;
struct neighbour *hash_buckets[NEIGH_HASHMASK+1];
struct pneigh_entry *phash_buckets[PNEIGH_HASHMASK+1];
};
/*
 * Tunable neighbour parameters.  A table carries one default set and a
 * device may supply its own.  Values in parentheses are the arp_tbl
 * defaults; times are in jiffies.
 */
struct neigh_parms
{
	struct neigh_parms *next;
	int (*neigh_setup)(struct neighbour *);	/* optional device-specific setup */
	struct neigh_table *tbl;
	int entries;
	void *priv;

	void *sysctl_table;

	int base_reachable_time;	/* (30 s) base for the randomised reachable_time */
	int retrans_time;		/* (1 s) interval between solicitations */
	int gc_staletime;		/* (60 s) unused entries older than this are reclaimed */
	int reachable_time;		/* (about 30 s) randomised from base_reachable_time */
	int delay_probe_time;		/* (5 s) time spent in NUD_DELAY before probing */

	int queue_len;			/* (3) max packets held in arp_queue */
	int ucast_probes;		/* (3) */
	int app_probes;			/* (0) */
	int mcast_probes;		/* (3) */
	int anycast_delay;		/* (1 s) */
	int proxy_delay;		/* (0.8 s) */
	int proxy_qlen;			/* (64) */
	int locktime;			/* (1 s) min age before a reply may override an entry */
};
struct neighbour 网络邻居结构
{
struct neighbour *next;
struct neigh_table *tbl;
struct neigh_parms *parms;
struct net_device *dev;
unsigned long used;
unsigned long confirmed;
unsigned long updated;
__u8 flags;
__u8 nud_state;
__u8 type;
__u8 dead;
atomic_t probes;
rwlock_t lock;
unsigned char ha[(MAX_ADDR_LEN+sizeof(unsigned long)-1)&~(sizeof(unsigned long)-1)];
struct hh_cache *hh;
atomic_t refcnt;
int (*output)(struct sk_buff *skb);
struct sk_buff_head arp_queue;
struct timer_list timer;
struct neigh_ops *ops;
u8 primary_key[0];
};
struct hh_cache 帧头缓冲结构
{
struct hh_cache *hh_next; /* Next entry */
atomic_t hh_refcnt; /* number of users */
unsigned short hh_type; /* protocol identifier, f.e ETH_P_IP */
int hh_len; /* length of header */
int (*hh_output)(struct sk_buff *skb);
rwlock_t hh_lock;
/* cached hardware header; allow for machine alignment needs. */
unsigned long hh_data[16/sizeof(unsigned long)];
};


struct neigh_table arp_tbl = arp网络邻居表, 用于TCPIP协议中的网络邻居信息索引
{
NULL,
AF_INET,
sizeof(struct neighbour) + 4,
4,
arp_hash,
arp_constructor,
NULL,
NULL,
parp_redo,
"arp_cache",
{ NULL, NULL, &arp_tbl, 0, NULL, NULL,
30*HZ, 1*HZ, 60*HZ, 30*HZ, 5*HZ,
3, 3, 0, 3, 1*HZ, (8*HZ)/10, 64, 1*HZ },30*HZ, 128, 512, 1024,
};

/* Initialise the ARP module. */
void __init arp_init (void)
{
	neigh_table_init(&arp_tbl);		/* set up the ARP neighbour table */

	dev_add_pack(&arp_packet_type);		/* register the ARP packet receiver */

	proc_net_create ("arp", 0, arp_get_info);

#ifdef CONFIG_SYSCTL
	neigh_sysctl_register(NULL, &arp_tbl.parms, NET_IPV4, NET_IPV4_NEIGH, "ipv4");
#endif
}

static struct neigh_table *neigh_tables;	/* list of all neighbour tables */

/*
 * Prepare a neighbour table for use: pick the initial randomised
 * reachable_time, create the entry allocator if the table has none, arm
 * the periodic GC timer, set up the proxy timer and queue, and link the
 * table into neigh_tables.
 */
void neigh_table_init(struct neigh_table *tbl)
{
	unsigned long now = jiffies;

	tbl->parms.reachable_time = neigh_rand_reach_time(tbl->parms.base_reachable_time);
	/* Slab cache for neighbour entries; size rounded up to 16 bytes. */
	if (tbl->kmem_cachep == NULL)
		tbl->kmem_cachep = kmem_cache_create(tbl->id,
						     (tbl->entry_size+15)&~15,
						     0, SLAB_HWCACHE_ALIGN,
						     NULL, NULL);

#ifdef CONFIG_SMP
	tasklet_init(&tbl->gc_task, SMP_TIMER_NAME(neigh_periodic_timer), (unsigned long)tbl);
#endif
	init_timer(&tbl->gc_timer);
	tbl->lock = RW_LOCK_UNLOCKED;
	tbl->gc_timer.data = (unsigned long)tbl;
	tbl->gc_timer.function = neigh_periodic_timer;
	/* First scan is pushed out by one reachable_time on top of gc_interval. */
	tbl->gc_timer.expires = now + tbl->gc_interval + tbl->parms.reachable_time;
	add_timer(&tbl->gc_timer);

	/* The proxy timer is initialised here but not armed. */
	init_timer(&tbl->proxy_timer);
	tbl->proxy_timer.data = (unsigned long)tbl;
	tbl->proxy_timer.function = neigh_proxy_process;
	skb_queue_head_init(&tbl->proxy_queue);

	tbl->last_flush = now;
	tbl->last_rand = now + tbl->parms.reachable_time*20;
	write_lock(&neigh_tbl_lock);
	tbl->next = neigh_tables;
	neigh_tables = tbl;
	write_unlock(&neigh_tbl_lock);
}
/*
 * Return a random value in [base/2, base/2 + base), i.e. spread around
 * base (not around base/2).
 *
 * base == 0 yields 0; the unguarded "% base" would divide by zero.
 */
unsigned long neigh_rand_reach_time(unsigned long base)
{
	if (base == 0)
		return 0;
	return (net_random() % base) + (base>>1);
}

static void SMP_TIMER_NAME(neigh_periodic_timer)(unsigned long arg)
{
struct neigh_table *tbl = (struct neigh_table*)arg;
unsigned long now = jiffies;
int i;


write_lock(&tbl->lock);

/*
* periodicly recompute ReachableTime from random function
*/

if (now - tbl->last_rand > 300*HZ) {
struct neigh_parms *p;
tbl->last_rand = now;
for (p=&tbl->parms; p; p = p->next)
p->reachable_time = neigh_rand_reach_time(p->base_reachable_time);
}

for (i=0; i <= NEIGH_HASHMASK; i++) { 扫描所有的网络邻居
struct neighbour *n, **np;

np = &tbl->hash_buckets
;
while ((n = *np) != NULL) {
unsigned state;

write_lock(&n->lock);

state = n->nud_state;
if (state&(NUD_PERMANENT|NUD_IN_TIMER)) { 对于不可释放或正在解析的邻居
write_unlock(&n->lock);
goto next_elt;
}

if ((long)(n->used - n->confirmed) < 0)
n->used = n->confirmed;

if (atomic_read(&n->refcnt) == 1 &&
(state == NUD_FAILED || now - n->used > n->parms->gc_staletime)) {
*np = n->next; 释放那些解析失败和引用时间超过(60)秒的邻居
n->dead = 1;
write_unlock(&n->lock);
neigh_release(n);
continue;
}

if (n->nud_state&NUD_REACHABLE &&
now - n->confirmed > n->parms->reachable_time) {
n->nud_state = NUD_STALE; 如果解析时间大于可达超时, 则邻居的地址失效
neigh_suspect(n);
}
write_unlock(&n->lock);

next_elt:
np = &n->next;
}
}

mod_timer(&tbl->gc_timer, now + tbl->gc_interval);
write_unlock(&tbl->lock);
}

int arp_bind_neighbour(struct dst_entry *dst) 将邻居绑定给指定路由
{
struct net_device *dev = dst->dev;
struct neighbour *n = dst->neighbour;

if (dev == NULL)
return -EINVAL;
if (n == NULL) {
u32 nexthop = ((struct rtable*)dst)->rt_gateway; 取网关地址
if (dev->flags&(IFF_LOOPBACK|IFF_POINTOPOINT))
nexthop = 0;
n = __neigh_lookup_errno(
#ifdef CONFIG_ATM_CLIP
dev->type == ARPHRD_ATM ? &clip_tbl :
#endif
&arp_tbl, &nexthop, dev); 在arp_tbl中查询目标地址nexthop的设备邻居
if (IS_ERR(n))
return PTR_ERR(n);
dst->neighbour = n;
}
return 0;
}
/*
 * Look a neighbour up and create it when missing.  Returns whatever
 * neigh_create() returns on a miss, which may be an ERR_PTR value.
 */
static inline struct neighbour *
__neigh_lookup_errno(struct neigh_table *tbl, const void *pkey,
		     struct net_device *dev)
{
	struct neighbour *n = neigh_lookup(tbl, pkey, dev);

	if (n)
		return n;

	return neigh_create(tbl, pkey, dev);	/* create a new entry in the table */
}
/*
 * Find the neighbour with key pkey on device dev.  On a hit the entry is
 * returned with an extra reference (neigh_hold); on a miss NULL is returned.
 */
struct neighbour *neigh_lookup(struct neigh_table *tbl, const void *pkey,
			       struct net_device *dev)
{
	struct neighbour *n;
	u32 hash_val;
	int key_len = tbl->key_len;

	hash_val = tbl->hash(pkey, dev);	/* bucket index for this key */

	read_lock_bh(&tbl->lock);
	for (n = tbl->hash_buckets[hash_val]; n; n = n->next) {
		if (dev == n->dev &&
		    memcmp(n->primary_key, pkey, key_len) == 0) {
			neigh_hold(n);
			break;
		}
	}
	read_unlock_bh(&tbl->lock);
	return n;
}
/*
 * Hash an IPv4 key plus the device's interface index into a bucket index
 * in the range 0..NEIGH_HASHMASK.
 */
static u32 arp_hash(const void *pkey, const struct net_device *dev)
{
	u32 h = *(u32*)pkey;

	/* Fold the high bits down before mixing in the interface index. */
	h ^= h >> 16;
	h ^= h >> 8;
	h ^= h >> 3;

	return (h ^ dev->ifindex) & NEIGH_HASHMASK;
}
struct neighbour * neigh_create(struct neigh_table *tbl, const void *pkey,
struct net_device *dev)
{
struct neighbour *n, *n1;
u32 hash_val;
int key_len = tbl->key_len;
int error;

n = neigh_alloc(tbl); 为邻居表tbl创建新的邻居
if (n == NULL)
return ERR_PTR(-ENOBUFS);

memcpy(n->primary_key, pkey, key_len); 拷贝邻居的键值
n->dev = dev;
dev_hold(dev);

/* Protocol specific setup. */
if (tbl->constructor && (error = tbl->constructor(n)) < 0) {
neigh_release(n);
return ERR_PTR(error);
}

/* Device specific setup. */
if (n->parms && n->parms->neigh_setup &&
(error = n->parms->neigh_setup(n)) < 0) {
neigh_release(n);
return ERR_PTR(error);
}

n->confirmed = jiffies - (n->parms->base_reachable_time<<1);

hash_val = tbl->hash(pkey, dev);

write_lock_bh(&tbl->lock);
for (n1 = tbl->hash_buckets[hash_val]; n1; n1 = n1->next) {
if (dev == n1->dev &&
memcmp(n1->primary_key, pkey, key_len) == 0) {
neigh_hold(n1); 如果邻居表中有重键,则释放新建邻居,返回已有的邻居
write_unlock_bh(&tbl->lock);
neigh_release(n);
return n1;
}
}

n->next = tbl->hash_buckets[hash_val];
tbl->hash_buckets[hash_val] = n; 插入索引链
n->dead = 0;
neigh_hold(n);
write_unlock_bh(&tbl->lock);
NEIGH_PRINTK2("neigh %p is created.\n", n);
return n;
}

static struct neighbour *neigh_alloc(struct neigh_table *tbl) 分配邻居结构
{
struct neighbour *n;
unsigned long now = jiffies;

if (tbl->entries > tbl->gc_thresh3 || 如果邻居表的单元数目超过了(1024)
(tbl->entries > tbl->gc_thresh2 && 或者超过了(512)并且
now - tbl->last_flush > 5*HZ)) { 与上次刷新间隔超过5秒
if (neigh_forced_gc(tbl) == 0 && 释放过时的邻居
tbl->entries > tbl->gc_thresh3)
return NULL;
}

n = kmem_cache_alloc(tbl->kmem_cachep, SLAB_ATOMIC);
if (n == NULL)
return NULL;

memset(n, 0, tbl->entry_size);

skb_queue_head_init(&n->arp_queue); 当邻居的地址尚未解析时,
发向该邻居的包缓冲在该队列中
n->lock = RW_LOCK_UNLOCKED;
n->updated = n->used = now;
n->nud_state = NUD_NONE;
n->output = neigh_blackhole; 邻居的IP包注入口
n->parms = &tbl->parms; 继承邻居表的参数
init_timer(&n->timer);
n->timer.function = neigh_timer_handler; 接收超时临视器
n->timer.data = (unsigned long)n;
tbl->stats.allocs++;
neigh_glbl_allocs++;
tbl->entries++;
n->tbl = tbl;
atomic_set(&n->refcnt, 1);
n->dead = 1;
return n;
}
/* Output hook that transmits nothing: frees the packet and returns -ENETDOWN. */
static int neigh_blackhole(struct sk_buff *skb)
{
kfree_skb(skb);
return -ENETDOWN;
}
static int neigh_forced_gc(struct neigh_table *tbl) 强制释放邻居
{
int shrunk = 0;
int i;

for (i=0; i<=NEIGH_HASHMASK; i++) {
struct neighbour *n, **np;

np = &tbl->hash_buckets
;
write_lock_bh(&tbl->lock);
while ((n = *np) != NULL) {
/* Neighbour record may be discarded if:
- nobody refers to it.
- it is not premanent
- (NEW and probably wrong)
INCOMPLETE entries are kept at least for
n->parms->retrans_time, otherwise we could
flood network with resolution requests.
It is not clear, what is better table overflow
or flooding.
*/
write_lock(&n->lock);
if (atomic_read(&n->refcnt) == 1 &&
!(n->nud_state&NUD_PERMANENT) &&
(n->nud_state != NUD_INCOMPLETE ||
jiffies - n->used > n->parms->retrans_time)) {
*np = n->next;
n->dead = 1;
shrunk = 1;
write_unlock(&n->lock);
neigh_release(n);
continue;
}
write_unlock(&n->lock);
np = &n->next;
}
write_unlock_bh(&tbl->lock);
}

tbl->last_flush = jiffies;
return shrunk;
}
/* Drop one reference; the neighbour is destroyed when the count reaches zero. */
static inline void neigh_release(struct neighbour *neigh)
{
if (atomic_dec_and_test(&neigh->refcnt))
neigh_destroy(neigh);
}
/*
 * Cancel the neighbour's timer if it is in a timer state.  When a pending
 * timer was actually removed, the reference held for it is released and
 * 1 is returned; otherwise 0.
 */
static int neigh_del_timer(struct neighbour *n)
{
	if (!(n->nud_state & NUD_IN_TIMER))
		return 0;

	if (!del_timer(&n->timer))
		return 0;

	neigh_release(n);
	return 1;
}
/*
 * Free a neighbour whose last reference is gone.  Refuses (with a log
 * message) to destroy an entry that is not marked dead.  Detaches all
 * cached frame headers, runs the ops destructor, drops queued packets
 * and the device reference, and returns the entry to the slab cache.
 */
void neigh_destroy(struct neighbour *neigh)
{
	struct hh_cache *hh;

	if (!neigh->dead) {
		printk("Destroying alive neighbour %p from %08lx\n", neigh,
		       *(((unsigned long*)&neigh)-1));
		return;
	}

	if (neigh_del_timer(neigh))
		printk("Impossible event.\n");

	/* Detach every frame-header cache; remaining users get a blackhole output. */
	while ((hh = neigh->hh) != NULL) {
		neigh->hh = hh->hh_next;
		hh->hh_next = NULL;
		write_lock_bh(&hh->hh_lock);
		hh->hh_output = neigh_blackhole;
		write_unlock_bh(&hh->hh_lock);
		if (atomic_dec_and_test(&hh->hh_refcnt))
			kfree(hh);
	}

	if (neigh->ops && neigh->ops->destructor)
		(neigh->ops->destructor)(neigh);

	skb_queue_purge(&neigh->arp_queue);	/* drop packets still waiting */

	dev_put(neigh->dev);

	NEIGH_PRINTK2("neigh %p is destroyed.\n", neigh);

	neigh_glbl_allocs--;
	neigh->tbl->entries--;
	kmem_cache_free(neigh->tbl->kmem_cachep, neigh);
}

/* As fast as possible without hh cache */

/* Total solicitations allowed before resolution fails: unicast + application + multicast. */
static __inline__ int neigh_max_probes(struct neighbour *n)
{
	struct neigh_parms *parms = n->parms;
	int total = parms->ucast_probes;

	total += parms->app_probes;
	total += parms->mcast_probes;
	return total;
}

/*
 * Per-neighbour timer (arg is the struct neighbour *).
 *
 *  - Not in a timer state: nothing to do.
 *  - Valid and confirmed within reachable_time: back to NUD_REACHABLE.
 *  - NUD_DELAY expired: move to NUD_PROBE and restart the probe count.
 *  - Probe budget exhausted: NUD_FAILED, report an error for each queued
 *    packet and drop the queue.
 *  - Otherwise re-arm for retrans_time and send another solicitation.
 *
 * Every path through "out" releases the reference held for the timer;
 * the re-arm path keeps it.
 */
static void neigh_timer_handler(unsigned long arg)
{
	unsigned long now = jiffies;
	struct neighbour *neigh = (struct neighbour*)arg;
	unsigned state;
	int notify = 0;

	write_lock(&neigh->lock);

	state = neigh->nud_state;

	if (!(state&NUD_IN_TIMER)) {
#ifndef CONFIG_SMP
		printk("neigh: timer & !nud_in_timer\n");
#endif
		goto out;
	}

	if ((state&NUD_VALID) &&
	    now - neigh->confirmed < neigh->parms->reachable_time) {
		neigh->nud_state = NUD_REACHABLE;
		NEIGH_PRINTK2("neigh %p is still alive.\n", neigh);
		neigh_connect(neigh);
		goto out;
	}
	if (state == NUD_DELAY) {
		NEIGH_PRINTK2("neigh %p is probed.\n", neigh);
		neigh->nud_state = NUD_PROBE;
		atomic_set(&neigh->probes, 0);
	}

	if (atomic_read(&neigh->probes) >= neigh_max_probes(neigh)) {
		struct sk_buff *skb;

		neigh->nud_state = NUD_FAILED;
		notify = 1;
		neigh->tbl->stats.res_failed++;
		NEIGH_PRINTK2("neigh %p is failed.\n", neigh);

		/* It is very thin place. report_unreachable is very complicated
		   routine. Particularly, it can hit the same neighbour entry!

		   So that, we try to be accurate and avoid dead loop. --ANK
		 */
		while(neigh->nud_state==NUD_FAILED && (skb=__skb_dequeue(&neigh->arp_queue)) != NULL) {
			write_unlock(&neigh->lock);
			neigh->ops->error_report(neigh, skb);
			write_lock(&neigh->lock);
		}
		skb_queue_purge(&neigh->arp_queue);
		goto out;
	}

	neigh->timer.expires = now + neigh->parms->retrans_time;
	add_timer(&neigh->timer);
	write_unlock(&neigh->lock);

	/* Resend the solicitation (ARP request for arp_tbl neighbours). */
	neigh->ops->solicit(neigh, skb_peek(&neigh->arp_queue));
	atomic_inc(&neigh->probes);
	return;

out:
	write_unlock(&neigh->lock);
#ifdef CONFIG_ARPD
	if (notify && neigh->parms->app_probes)
		neigh_app_notify(neigh);
#endif
	neigh_release(neigh);
}
static int arp_constructor(struct neighbour *neigh)
{
u32 addr = *(u32*)neigh->primary_key; 主键为邻居的IP地址
struct net_device *dev = neigh->dev; 取邻居所在的主机设备
struct in_device *in_dev = in_dev_get(dev);

if (in_dev == NULL)
return -EINVAL;

neigh->type = inet_addr_type(addr); 取邻居的IP地址类型
if (in_dev->arp_parms)
neigh->parms = in_dev->arp_parms; 继承设备的arp参数

in_dev_put(in_dev);

if (dev->hard_header == NULL) {
neigh->nud_state = NUD_NOARP;
neigh->ops = &arp_direct_ops; 安装直达操作表
neigh->output = neigh->ops->queue_xmit; 向邻居的输出函数为(dev_queue_xmit)
} else {
/* Good devices (checked by reading texts, but only Ethernet is
tested)

ARPHRD_ETHER: (ethernet, apfddi)
ARPHRD_FDDI: (fddi)
ARPHRD_IEEE802: (tr)
ARPHRD_METRICOM: (strip)
ARPHRD_ARCNET:
etc. etc. etc.

ARPHRD_IPDDP will also work, if author repairs it.
I did not it, because this driver does not work even
in old paradigm.
*/

#if 1
/* So... these "amateur" devices are hopeless.
The only thing, that I can say now:
It is very sad that we need to keep ugly obsolete
code to make them happy.

They should be moved to more reasonable state, now
they use rebuild_header INSTEAD OF hard_start_xmit!!!
Besides that, they are sort of out of date
(a lot of redundant clones/copies, useless in 2.1),
I wonder why people believe that they work.
*/
switch (dev->type) {
default:
break;
case ARPHRD_ROSE:
#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
case ARPHRD_AX25:
#if defined(CONFIG_NETROM) || defined(CONFIG_NETROM_MODULE)
case ARPHRD_NETROM:
#endif
neigh->ops = &arp_broken_ops;
neigh->output = neigh->ops->output;
return 0;
#endif
;}
#endif
if (neigh->type == RTN_MULTICAST) {
neigh->nud_state = NUD_NOARP;
arp_mc_map(addr, neigh->ha, dev, 1);
} else if (dev->flags&(IFF_NOARP|IFF_LOOPBACK)) {
neigh->nud_state = NUD_NOARP;
memcpy(neigh->ha, dev->dev_addr, dev->addr_len);
} else if (neigh->type == RTN_BROADCAST || dev->flags&IFF_POINTOPOINT) {
neigh->nud_state = NUD_NOARP;
memcpy(neigh->ha, dev->broadcast, dev->addr_len);
}
if (dev->hard_header_cache)
neigh->ops = &arp_hh_ops; 安装邻居的帧头缓冲操作
else
neigh->ops = &arp_generic_ops;
if (neigh->nud_state&NUD_VALID) 地址是否已解析
neigh->output = neigh->ops->connected_output; 直接发送
else
neigh->output = neigh->ops->output; 先解析再发送
}
return 0;
}

/* Per-neighbour operations table; the arp_*_ops instances below fill it positionally. */
struct neigh_ops
{
int family;
void (*destructor)(struct neighbour *);
/* Sends a resolution request for the neighbour. */
void (*solicit)(struct neighbour *, struct sk_buff*);
/* Called for each queued packet when resolution fails. */
void (*error_report)(struct neighbour *, struct sk_buff*);
/* Transmit hook used while the address is not trusted. */
int (*output)(struct sk_buff*);
/* Transmit hook used once the address is valid. */
int (*connected_output)(struct sk_buff*);
/* Transmit hook installed into a frame-header cache of a connected neighbour. */
int (*hh_output)(struct sk_buff*);
int (*queue_xmit)(struct sk_buff*);
};
static struct neigh_ops arp_direct_ops = 无解析发送
{
AF_INET,
NULL,
NULL,
NULL,
dev_queue_xmit,
dev_queue_xmit,
dev_queue_xmit,
dev_queue_xmit
};
struct neigh_ops arp_broken_ops = 不完全解析
{
AF_INET,
NULL,
arp_solicit,
arp_error_report,
neigh_compat_output,
neigh_compat_output,
dev_queue_xmit,
dev_queue_xmit,
};
/*
 * Output hook of arp_broken_ops: rebuilds the link-layer header through the
 * device callbacks and queues the packet on the device.
 */
int neigh_compat_output(struct sk_buff *skb)
{
struct net_device *dev = skb->dev;

/* Strip anything in front of the network header. */
__skb_pull(skb, skb->nh.raw - skb->data);

/* Returns 0 without queuing only when hard_header() fails AND rebuild_header() returns non-zero. */
if (dev->hard_header &&
dev->hard_header(skb, dev, ntohs(skb->protocol), NULL, NULL, skb->len) < 0 &&
dev->rebuild_header(skb))
return 0;

return dev_queue_xmit(skb);
}
static struct neigh_ops arp_hh_ops = 通帧头缓冲解析
{
AF_INET,
NULL,
arp_solicit,
arp_error_report,
neigh_resolve_output,
neigh_resolve_output,
dev_queue_xmit,
dev_queue_xmit
};
/*
 * Output hook that resolves the neighbour's hardware address if needed.
 *
 * If neigh_event_send() returns 0 the hardware header is built (creating
 * the route's frame-header cache first when the device supports it and the
 * route has none) and the packet is queued for transmission.  A non-zero
 * return from neigh_event_send() means the packet was consumed there, so
 * 0 is returned.  Returns -EINVAL, after freeing the packet, when the
 * route or neighbour is missing or the header cannot be built.
 */
int neigh_resolve_output(struct sk_buff *skb)
{
struct dst_entry *dst = skb->dst;
struct neighbour *neigh;

if (!dst || !(neigh = dst->neighbour))
goto discard;

/* Strip anything in front of the network header. */
__skb_pull(skb, skb->nh.raw - skb->data);

if (neigh_event_send(neigh, skb) == 0) {
int err;
struct net_device *dev = neigh->dev;
if (dev->hard_header_cache && dst->hh == NULL) {
/* Write lock: neigh_hh_init() modifies the neighbour's hh list. */
write_lock_bh(&neigh->lock);
/* Re-check under the lock. */
if (dst->hh == NULL)
neigh_hh_init(neigh, dst, dst->ops->protocol);
err = dev->hard_header(skb, dev, ntohs(skb->protocol), neigh->ha, NULL, skb->len);
write_unlock_bh(&neigh->lock);
} else {
/* Read lock suffices: neigh->ha is only read. */
read_lock_bh(&neigh->lock);
err = dev->hard_header(skb, dev, ntohs(skb->protocol), neigh->ha, NULL, skb->len);
read_unlock_bh(&neigh->lock);
}
if (err >= 0)
return neigh->ops->queue_xmit(skb);
kfree_skb(skb);
return -EINVAL;
}
return 0;

discard:
NEIGH_PRINTK1("neigh_resolve_output: dst=%p neigh=%p\n", dst, dst ? dst->neighbour :
NULL);
kfree_skb(skb);
return -EINVAL;
}
/*
 * Attach a frame-header cache for the given protocol to the route dst.
 * Reuses a cache already on the neighbour's list with the same hh_type,
 * otherwise allocates one and lets the device fill it in.  On allocation
 * or device failure dst->hh is left untouched.
 */
static void neigh_hh_init(struct neighbour *n, struct dst_entry *dst, u16 protocol)
{
	struct hh_cache *hh = NULL;
	struct net_device *dev = dst->dev;

	for (hh=n->hh; hh; hh = hh->hh_next)
		if (hh->hh_type == protocol)
			break;

	if (!hh && (hh = kmalloc(sizeof(*hh), GFP_ATOMIC)) != NULL) {
		memset(hh, 0, sizeof(struct hh_cache));
		hh->hh_lock = RW_LOCK_UNLOCKED;
		hh->hh_type = protocol;
		atomic_set(&hh->hh_refcnt, 0);
		hh->hh_next = NULL;
		if (dev->hard_header_cache(n, hh)) {
			kfree(hh);
			hh = NULL;
		} else {
			atomic_inc(&hh->hh_refcnt);	/* reference held by the neighbour's list */
			hh->hh_next = n->hh;
			n->hh = hh;
			if (n->nud_state&NUD_CONNECTED)		/* hardware address is trusted */
				hh->hh_output = n->ops->hh_output;	/* use the ops' hh_output hook */
			else
				hh->hh_output = n->ops->output;		/* go through the neighbour output */
		}
	}
	if (hh) {
		atomic_inc(&hh->hh_refcnt);		/* reference held by the route */
		dst->hh = hh;
	}
}

/*
 * Ops for resolving devices without frame-header caching.  Fields in
 * struct neigh_ops order: family, destructor, solicit, error_report,
 * output, connected_output, hh_output, queue_xmit.
 */
static struct neigh_ops arp_generic_ops =
{
AF_INET,
NULL,
arp_solicit,
arp_error_report,
neigh_resolve_output,
neigh_connected_output,
dev_queue_xmit,
dev_queue_xmit
};
/*
 * Output hook for a neighbour whose hardware address is valid: builds the
 * link-layer header from neigh->ha and queues the packet.  Frees the
 * packet and returns -EINVAL if the header cannot be built.
 */
int neigh_connected_output(struct sk_buff *skb)
{
	struct dst_entry *dst = skb->dst;
	struct neighbour *neigh = dst->neighbour;
	struct net_device *dev = neigh->dev;
	int rc;

	/* Strip anything in front of the network header. */
	__skb_pull(skb, skb->nh.raw - skb->data);

	read_lock_bh(&neigh->lock);
	rc = dev->hard_header(skb, dev, ntohs(skb->protocol), neigh->ha, NULL, skb->len);
	read_unlock_bh(&neigh->lock);

	if (rc < 0) {
		kfree_skb(skb);
		return -EINVAL;
	}
	return neigh->ops->queue_xmit(skb);
}

/*
 * Note that the neighbour is being used.  Connected, delayed and probing
 * neighbours may transmit at once (returns 0); every other state is
 * handled by __neigh_event_send().
 */
static inline int neigh_event_send(struct neighbour *neigh, struct sk_buff *skb)
{
	neigh->used = jiffies;

	if (neigh->nud_state & (NUD_CONNECTED|NUD_DELAY|NUD_PROBE))
		return 0;

	return __neigh_event_send(neigh, skb);
}
/*
 * Slow path of neigh_event_send().
 *
 * Returns 0 when the caller may transmit now, 1 when the packet was
 * queued on arp_queue or freed here.
 *
 *  - Neither STALE nor INCOMPLETE: start resolution (NUD_INCOMPLETE, timer
 *    armed, first solicitation sent), or fail at once when no multicast
 *    or application probes are configured.
 *  - INCOMPLETE: queue the packet, dropping one queued packet first when
 *    queue_len is reached.
 *  - STALE: move to NUD_DELAY, arm the delay timer, let the packet go.
 */
int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb)
{
	write_lock_bh(&neigh->lock);
	if (!(neigh->nud_state&(NUD_CONNECTED|NUD_DELAY|NUD_PROBE))) {
		if (!(neigh->nud_state&(NUD_STALE|NUD_INCOMPLETE))) {
			if (neigh->parms->mcast_probes + neigh->parms->app_probes) {
				/* Start counting past the unicast probes. */
				atomic_set(&neigh->probes, neigh->parms->ucast_probes);
				neigh->nud_state = NUD_INCOMPLETE;
				neigh_hold(neigh);	/* reference held for the timer */
				neigh->timer.expires = jiffies + neigh->parms->retrans_time; /* next retransmit */
				add_timer(&neigh->timer);
				write_unlock_bh(&neigh->lock);
				neigh->ops->solicit(neigh, skb);
				atomic_inc(&neigh->probes);
				write_lock_bh(&neigh->lock);
			} else {
				neigh->nud_state = NUD_FAILED;
				write_unlock_bh(&neigh->lock);

				if (skb)
					kfree_skb(skb);
				return 1;
			}
		}
		if (neigh->nud_state == NUD_INCOMPLETE) {
			if (skb) {
				if (skb_queue_len(&neigh->arp_queue) >= neigh->parms->queue_len) {
					struct sk_buff *buff;
					buff = neigh->arp_queue.prev;
					__skb_unlink(buff, &neigh->arp_queue);
					kfree_skb(buff);
				}
				__skb_queue_head(&neigh->arp_queue, skb);
			}
			write_unlock_bh(&neigh->lock);
			return 1;
		}
		if (neigh->nud_state == NUD_STALE) {
			NEIGH_PRINTK2("neigh %p is delayed.\n", neigh);
			neigh_hold(neigh);	/* reference held for the timer */
			neigh->nud_state = NUD_DELAY;
			neigh->timer.expires = jiffies + neigh->parms->delay_probe_time;
			add_timer(&neigh->timer);
		}
	}
	write_unlock_bh(&neigh->lock);
	return 0;
}
static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb) arp请求
{
u32 saddr;
u8 *dst_ha = NULL;
struct net_device *dev = neigh->dev;
u32 target = *(u32*)neigh->primary_key; 取邻居的IP地址
int probes = atomic_read(&neigh->probes);

if (skb && inet_addr_type(skb->nh.iph->saddr) == RTN_LOCAL)
saddr = skb->nh.iph->saddr;
else
saddr = inet_select_addr(dev, target, RT_SCOPE_LINK);

if ((probes -= neigh->parms->ucast_probes) < 0) {
if (!(neigh->nud_state&NUD_VALID))
printk(KERN_DEBUG "trying to ucast probe in NUD_INVALID\n");
dst_ha = neigh->ha;
read_lock_bh(&neigh->lock);
} else if ((probes -= neigh->parms->app_probes) < 0) {
#ifdef CONFIG_ARPD
neigh_app_ns(neigh);
#endif
return;
}

arp_send(ARPOP_REQUEST, ETH_P_ARP, target, dev, saddr,
dst_ha, dev->dev_addr, NULL);
if (dst_ha)
read_unlock_bh(&neigh->lock);
}
/* Fixed part of an ARP packet; the variable-sized address fields follow it. */
struct arphdr
{
	unsigned short ar_hrd;	/* format of hardware address */
	unsigned short ar_pro;	/* format of protocol address */
	unsigned char ar_hln;	/* length of hardware address */
	unsigned char ar_pln;	/* length of protocol address */
	unsigned short ar_op;	/* ARP opcode (command) */

#if 0
	/*
	 * Ethernet looks like this : This bit is variable sized however...
	 */
	unsigned char ar_sha[ETH_ALEN]; /* sender hardware address */
	unsigned char ar_sip[4]; /* sender IP address */
	unsigned char ar_tha[ETH_ALEN]; /* target hardware address */
	unsigned char ar_tip[4]; /* target IP address */
#endif

};
/*
 * Build and transmit one ARP packet.
 *
 * type      - ARP opcode (ARPOP_REQUEST / ARPOP_REPLY)
 * ptype     - link-layer protocol type passed to dev->hard_header
 * dest_ip   - target IP address placed in the packet
 * dev       - device to send on; nothing is sent if it has IFF_NOARP
 * src_ip    - sender IP address placed in the packet
 * dest_hw   - link-layer destination, NULL for the device broadcast address
 * src_hw    - sender hardware address, NULL for the device's own address
 * target_hw - target hardware address field, NULL to zero-fill it
 *
 * Allocation and header-build failures are silent: the packet is dropped.
 */
void arp_send(int type, int ptype, u32 dest_ip,
	      struct net_device *dev, u32 src_ip,
	      unsigned char *dest_hw, unsigned char *src_hw,
	      unsigned char *target_hw)
{
	struct sk_buff *skb;
	struct arphdr *arp;
	unsigned char *arp_ptr;

	/*
	 *	No arp on this interface.
	 */

	if (dev->flags&IFF_NOARP)
		return;

	/*
	 *	Allocate a buffer
	 */

	skb = alloc_skb(sizeof(struct arphdr)+ 2*(dev->addr_len+4)
				+ dev->hard_header_len + 15, GFP_ATOMIC);
	if (skb == NULL)
		return;

	/* Reserve room for the hardware header, rounded up to 16 bytes. */
	skb_reserve(skb, (dev->hard_header_len+15)&~15);
	skb->nh.raw = skb->data;		/* network header starts here */
	/* Claim the data area of the ARP packet. */
	arp = (struct arphdr *) skb_put(skb,sizeof(struct arphdr) + 2*(dev->addr_len+4));
	skb->dev = dev;				/* sending device */
	skb->protocol = __constant_htons (ETH_P_ARP);	/* packet type */
	if (src_hw == NULL)
		src_hw = dev->dev_addr;		/* default: our own hardware address */
	if (dest_hw == NULL)
		dest_hw = dev->broadcast;	/* default: broadcast */

	/*
	 *	Fill the device header for the ARP frame
	 */
	if (dev->hard_header &&
	    dev->hard_header(skb,dev,ptype,dest_hw,src_hw,skb->len) < 0)
		goto out;

	/*
	 * Fill out the arp protocol part.
	 *
	 * The arp hardware type should match the device type, except for FDDI,
	 * which (according to RFC 1390) should always equal 1 (Ethernet).
	 */
	/*
	 *	Exceptions everywhere. AX.25 uses the AX.25 PID value not the
	 *	DIX code for the protocol. Make these device structure fields.
	 */
	switch (dev->type) {
	default:
		arp->ar_hrd = htons(dev->type);	/* hardware type = device type */
		arp->ar_pro = __constant_htons(ETH_P_IP);
		break;

#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
	case ARPHRD_AX25:
		arp->ar_hrd = __constant_htons(ARPHRD_AX25);
		arp->ar_pro = __constant_htons(AX25_P_IP);
		break;

#if defined(CONFIG_NETROM) || defined(CONFIG_NETROM_MODULE)
	case ARPHRD_NETROM:
		arp->ar_hrd = __constant_htons(ARPHRD_NETROM);
		arp->ar_pro = __constant_htons(AX25_P_IP);
		break;
#endif
#endif

#ifdef CONFIG_FDDI
	case ARPHRD_FDDI:
		arp->ar_hrd = __constant_htons(ARPHRD_ETHER);
		arp->ar_pro = __constant_htons(ETH_P_IP);
		break;
#endif
#ifdef CONFIG_TR
	case ARPHRD_IEEE802_TR:
		arp->ar_hrd = __constant_htons(ARPHRD_IEEE802);
		arp->ar_pro = __constant_htons(ETH_P_IP);
		break;
#endif
	}

	arp->ar_hln = dev->addr_len;		/* hardware address length */
	arp->ar_pln = 4;			/* protocol address length */
	arp->ar_op = htons(type);		/* opcode */

	arp_ptr=(unsigned char *)(arp+1);	/* start of the address fields */

	memcpy(arp_ptr, src_hw, dev->addr_len);	/* sender hardware address */
	arp_ptr+=dev->addr_len;
	memcpy(arp_ptr, &src_ip,4);		/* sender IP address */
	arp_ptr+=4;
	if (target_hw != NULL)
		memcpy(arp_ptr, target_hw, dev->addr_len);
	else
		memset(arp_ptr, 0, dev->addr_len);
	arp_ptr+=dev->addr_len;
	memcpy(arp_ptr, &dest_ip, 4);		/* target IP address */
	skb->dev = dev;

	dev_queue_xmit(skb);
	return;

out:
	kfree_skb(skb);
}
/* Receive hook registered by arp_init() via dev_add_pack(): delivers ETH_P_ARP frames to arp_rcv(). */
static struct packet_type arp_packet_type =
{
__constant_htons(ETH_P_ARP),
NULL, /* All devices */
arp_rcv,
/* NOTE(review): meaning of this non-NULL private value is not visible here - confirm against struct packet_type. */
(void*)1,
NULL
};
/* Non-zero when the buffer's user count is anything other than 1. */
static inline int skb_shared(struct sk_buff *skb)
{
	int users = atomic_read(&skb->users);

	return users != 1;
}
/*
 * Return a buffer the caller owns exclusively.  An unshared skb is
 * returned unchanged; a shared one is cloned (allocation priority pri),
 * the caller's reference to the original is dropped, and the clone is
 * returned - NULL if cloning failed.
 */
static inline struct sk_buff *skb_share_check(struct sk_buff *skb, int pri)
{
	struct sk_buff *clone;

	if (!skb_shared(skb))
		return skb;

	clone = skb_clone(skb, pri);
	kfree_skb(skb);
	return clone;
}

/* Proxy ARP is on when enabled globally (ipv4_devconf) or on this device. */
#define IN_DEV_PROXY_ARP(in_dev) (ipv4_devconf.proxy_arp || (in_dev)->cnf.proxy_arp)
/* Per-device forwarding flag. */
#define IN_DEV_FORWARD(in_dev) ((in_dev)->cnf.forwarding)

int arp_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt)
{
struct arphdr *arp = skb->nh.arph;
unsigned char *arp_ptr= (unsigned char *)(arp+1);
struct rtable *rt;
unsigned char *sha, *tha;
u32 sip, tip;
u16 dev_type = dev->type;
int addr_type;
struct in_device *in_dev = in_dev_get(dev);
struct neighbour *n;

/*
* The hardware length of the packet should match the hardware length
* of the device. Similarly, the hardware types should match. The
* device should be ARP-able. Also, if pln is not 4, then the lookup
* is not from an IP number. We can't currently handle this, so toss
* it.
*/
if (in_dev == NULL || 如果设备还没打开
arp->ar_hln != dev->addr_len || 如果硬件地址长度不匹配
dev->flags & IFF_NOARP || 如果设备无需arp
skb->pkt_type == PACKET_OTHERHOST || 如果是属于外部任备的包
skb->pkt_type == PACKET_LOOPBACK || 如果是回送包
arp->ar_pln != 4) 如果IP地址长度不等于4
goto out;

if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
goto out_of_mem;

switch (dev_type) {
default:
if (arp->ar_pro != __constant_htons(ETH_P_IP))
goto out;
if (htons(dev_type) != arp->ar_hrd)
goto out;
break;
#ifdef CONFIG_NET_ETHERNET
case ARPHRD_ETHER:
/*
* ETHERNET devices will accept ARP hardware types of either
* 1 (Ethernet) or 6 (IEEE 802.2).
*/
if (arp->ar_hrd != __constant_htons(ARPHRD_ETHER) &&
arp->ar_hrd != __constant_htons(ARPHRD_IEEE802))
goto out;
if (arp->ar_pro != __constant_htons(ETH_P_IP))
goto out;
break;
#endif
#ifdef CONFIG_TR
case ARPHRD_IEEE802_TR:
/*
* Token ring devices will accept ARP hardware types of either
* 1 (Ethernet) or 6 (IEEE 802.2).
*/
if (arp->ar_hrd != __constant_htons(ARPHRD_ETHER) &&
arp->ar_hrd != __constant_htons(ARPHRD_IEEE802))
goto out;
if (arp->ar_pro != __constant_htons(ETH_P_IP))
goto out;
break;
#endif
#ifdef CONFIG_FDDI
case ARPHRD_FDDI:
/*
* According to RFC 1390, FDDI devices should accept ARP hardware types
* of 1 (Ethernet). However, to be more robust, we'll accept hardware
* types of either 1 (Ethernet) or 6 (IEEE 802.2).
*/
if (arp->ar_hrd != __constant_htons(ARPHRD_ETHER) &&
arp->ar_hrd != __constant_htons(ARPHRD_IEEE802))
goto out;
if (arp->ar_pro != __constant_htons(ETH_P_IP))
goto out;
break;
#endif
#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
case ARPHRD_AX25:
if (arp->ar_pro != __constant_htons(AX25_P_IP))
goto out;
if (arp->ar_hrd != __constant_htons(ARPHRD_AX25))
goto out;
break;
#if defined(CONFIG_NETROM) || defined(CONFIG_NETROM_MODULE)
case ARPHRD_NETROM:
if (arp->ar_pro != __constant_htons(AX25_P_IP))
goto out;
if (arp->ar_hrd != __constant_htons(ARPHRD_NETROM))
goto out;
break;
#endif
#endif
}

/* Understand only these message types */

if (arp->ar_op != __constant_htons(ARPOP_REPLY) &&
arp->ar_op != __constant_htons(ARPOP_REQUEST))
goto out; 只处理"应答"和"请求"两种操作码

/*
* Extract fields
*/
sha=arp_ptr; 指向参数区源硬件地址
arp_ptr += dev->addr_len;
memcpy(&sip, arp_ptr, 4); 发送者IP地址
arp_ptr += 4;
tha=arp_ptr; 所(要)解析的硬件地址
arp_ptr += dev->addr_len;
memcpy(&tip, arp_ptr, 4); 取所(要)解析的IP地址
/*
* Check for bad requests for 127.x.x.x and requests for multicast
* addresses. If this is one such, delete it.
*/
if (LOOPBACK(tip) || MULTICAST(tip))
goto out;

/*
* Process entry. The idea here is we want to send a reply if it is a
* request for us or if it is a request for someone else that we hold
* a proxy for. We want to add an entry to our cache if it is a reply
* to us or if it is a request for our address.
* (The assumption for this last is that if someone is requesting our
* address, they are probably intending to talk to us, so it saves time
* if we cache their address. Their address is also probably not in
* our cache, since ours is not in their cache.)
*
* Putting this another way, we only care about replies if they are to
* us, in which case we add them to the cache. For requests, we care
* about those for us and those for our proxies. We reply to both,
* and in the case of requests for us we add the requester to the arp
* cache.
*/

/* Special case: IPv4 duplicate address detection packet (RFC2131) */
if (sip == 0) { 如果发送者IP地址为零
if (arp->ar_op == __constant_htons(ARPOP_REQUEST) &&
inet_addr_type(tip) == RTN_LOCAL) 如果所请要解析的IP地址是主机的IP地址
arp_send(ARPOP_REPLY,ETH_P_ARP,tip,dev,tip,sha,dev->dev_addr,dev->dev_addr);
goto out;
}

if (arp->ar_op == __constant_htons(ARPOP_REQUEST) &&
ip_route_input(skb, tip, sip, 0, dev) == 0) { 查询目的地址在输入设备dev上的路由

rt = (struct rtable*)skb->dst;
addr_type = rt->rt_type;

if (addr_type == RTN_LOCAL) {
n = neigh_event_ns(&arp_tbl, sha, &sip, dev); 将请求设备的邻居刷新为"过期"状态
if (n) {
arp_send(ARPOP_REPLY,ETH_P_ARP,sip,dev,tip,sha,dev->dev_addr,sha);
neigh_release(n);
}
goto out;
} else if (IN_DEV_FORWARD(in_dev)) {
if ((rt->rt_flags&RTCF_DNAT) ||
(addr_type == RTN_UNICAST && rt->u.dst.dev != dev &&
(IN_DEV_PROXY_ARP(in_dev) || pneigh_lookup(&arp_tbl, &tip, dev, 0)))) {
n = neigh_event_ns(&arp_tbl, sha, &sip, dev);
if (n)
neigh_release(n);

if (skb->stamp.tv_sec == 0 || 如果是proxy_queue的redo包
skb->pkt_type == PACKET_HOST || 如果是发向自已的包
in_dev->arp_parms->proxy_delay == 0) {
arp_send(ARPOP_REPLY,ETH_P_ARP,sip,dev,tip,sha,dev->dev_addr,sha);
} else {
pneigh_enqueue(&arp_tbl, in_dev->arp_parms, skb); 缓冲请求包,作延迟处理
in_dev_put(in_dev);
return 0;
}
goto out;
}
}
}
; 收到应答包或非已的请求包
/* Update our ARP tables */

n = __neigh_lookup(&arp_tbl, &sip, dev, 0); 寻找发送者IP的邻居

#ifdef CONFIG_IP_ACCEPT_UNSOLICITED_ARP
/* Unsolicited ARP is not accepted by default.
It is possible, that this option should be enabled for some
devices (strip is candidate)
*/
if (n == NULL &&
arp->ar_op == __constant_htons(ARPOP_REPLY) &&
inet_addr_type(sip) == RTN_UNICAST)
n = __neigh_lookup(&arp_tbl, &sip, dev, -1);
#endif

if (n) {
int state = NUD_REACHABLE;
int override = 0;

/* If several different ARP replies follows back-to-back,
use the FIRST one. It is possible, if several proxy
agents are active. Taking the first reply prevents
arp trashing and chooses the fastest router.
*/
if (jiffies - n->updated >= n->parms->locktime)
override = 1;

/* Broadcast replies and request packets
do not assert neighbour reachability.
*/
if (arp->ar_op != __constant_htons(ARPOP_REPLY) ||
skb->pkt_type != PACKET_HOST)
state = NUD_STALE;
neigh_update(n, sha, state, override, 1);
neigh_release(n);
}

out:
kfree_skb(skb);
if (in_dev)
in_dev_put(in_dev);
out_of_mem:
return 0;
}
/*
 * neigh_event_ns - refresh the neighbour entry of a requesting host to the
 * "stale" state.
 *
 * tbl:    neighbour table to search.
 * lladdr: link-layer address of the requester (may be NULL).
 * saddr:  protocol address used as the lookup key.
 * dev:    device the request arrived on.
 *
 * The entry is looked up and, when missing, created only if a link-layer
 * address was supplied or the device uses no link-layer addresses
 * (dev->addr_len == 0).  A found/created entry is pushed through
 * neigh_update() with NUD_STALE and override enabled.
 *
 * Returns the entry, or NULL when none exists and none was created.
 * Callers visible in this file drop the returned entry with neigh_release().
 */
struct neighbour * neigh_event_ns(struct neigh_table *tbl,
				  u8 *lladdr, void *saddr,
				  struct net_device *dev)
{
	struct neighbour *neigh;

	neigh = __neigh_lookup(tbl, saddr, dev, lladdr || !dev->addr_len);
	if (neigh)
		neigh_update(neigh, lladdr, NUD_STALE, 1, 1);
	return neigh;
}
/*
 * __neigh_lookup - find a neighbour entry, optionally creating it.
 *
 * Returns the entry found by neigh_lookup().  When nothing is found and
 * creat is non-zero, neigh_create() is tried; an error result from it is
 * reported to the caller as NULL.
 */
static inline struct neighbour *
__neigh_lookup(struct neigh_table *tbl, const void *pkey, struct net_device *dev, int creat)
{
	struct neighbour *found;
	struct neighbour *fresh;

	found = neigh_lookup(tbl, pkey, dev);
	if (found)
		return found;

	/* Nothing cached and the caller does not want a new entry. */
	if (!creat)
		return NULL;

	fresh = neigh_create(tbl, pkey, dev);
	if (IS_ERR(fresh))
		return NULL;
	return fresh;
}
/*
 * neigh_update - apply a new NUD state and, optionally, a new link-layer
 * address to a neighbour entry.
 *
 * neigh:    entry to update.  neigh->lock is write-held for the whole
 *           update, except while queued packets are handed to output().
 * lladdr:   proposed link-layer address, or NULL to keep the cached one.
 * new:      requested NUD_* state.
 * override: non-zero lets lladdr replace a different, already valid cached
 *           address.
 * arp:      non-zero subjects the update to the NUD_NOARP/NUD_PERMANENT
 *           protection check below.
 *
 * Returns 0 on success; -EPERM when the entry is protected or when the
 * proposed address differs from a valid cached one and override is zero;
 * -EINVAL when no address is supplied and the entry holds no valid one.
 */
int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, int override, int
arp)
{
u8 old;
int err;
int notify = 0;
struct net_device *dev = neigh->dev;

write_lock_bh(&neigh->lock);
old = neigh->nud_state;

/* err stays -EPERM until one of the branches below replaces it. */
err = -EPERM;
if (arp && (old&(NUD_NOARP|NUD_PERMANENT)))
goto out;

/* Transition to a non-valid state: stop the timer, fall back to the
   slow output path if the entry was connected, and record the state. */
if (!(new&NUD_VALID)) {
neigh_del_timer(neigh);
if (old&NUD_CONNECTED)
neigh_suspect(neigh);
neigh->nud_state = new;
err = 0;
notify = old&NUD_VALID;
goto out;
}

/* Compare new lladdr with cached one */
/* From here on, lladdr == neigh->ha means "address unchanged". */
if (dev->addr_len == 0) {
/* First case: device needs no address. */
lladdr = neigh->ha;
} else if (lladdr) {
/* The second case: if something is already cached
and a new address is proposed:
- compare new & old
- if they are different, check override flag
*/
if (old&NUD_VALID) {
if (memcmp(lladdr, neigh->ha, dev->addr_len) == 0)
lladdr = neigh->ha;
else if (!override)
goto out;
}
} else {
/* No address is supplied; if we know something,
use it, otherwise discard the request.
*/
err = -EINVAL;
if (!(old&NUD_VALID))
goto out;
lladdr = neigh->ha;
}

/* neigh_sync() may change nud_state, so the old state is re-read. */
neigh_sync(neigh);
old = neigh->nud_state;
if (new&NUD_CONNECTED)
neigh->confirmed = jiffies;
neigh->updated = jiffies;

/* If entry was valid and address is not changed,
do not change entry state, if new one is STALE.
*/
err = 0;
if (old&NUD_VALID) {
if (lladdr == neigh->ha)
if (new == old || (new == NUD_STALE && (old&NUD_CONNECTED)))
goto out;
}
neigh_del_timer(neigh);
neigh->nud_state = new;
if (lladdr != neigh->ha) {
/* Address changed: store it and refresh the cached hardware headers. */
memcpy(&neigh->ha, lladdr, dev->addr_len);
neigh_update_hhs(neigh);
/* For a non-connected new state the confirmation time is back-dated by
   twice base_reachable_time -- presumably so the new address is not
   treated as recently confirmed; confirm against the NUD timers. */
if (!(new&NUD_CONNECTED))
neigh->confirmed = jiffies - (neigh->parms->base_reachable_time<<1);
#ifdef CONFIG_ARPD
notify = 1;
#endif
}
if (new == old)
goto out;
/* Select the output functions that match the new state. */
if (new&NUD_CONNECTED)
neigh_connect(neigh);
else
neigh_suspect(neigh);
/* The entry just became valid: transmit the packets that were queued
   in arp_queue while it was not. */
if (!(old&NUD_VALID)) {
struct sk_buff *skb;

/* Again: avoid dead loop if something went wrong */

while (neigh->nud_state&NUD_VALID &&
(skb=__skb_dequeue(&neigh->arp_queue)) != NULL) {
struct neighbour *n1 = neigh;
/* The lock is dropped around output() and re-taken afterwards; the
   loop condition re-checks nud_state on every iteration. */
write_unlock_bh(&neigh->lock);
/* On shaper/eql skb->dst->neighbour != neigh :( */
if (skb->dst && skb->dst->neighbour)
n1 = skb->dst->neighbour;
n1->output(skb);
write_lock_bh(&neigh->lock);
}
/* Drop whatever is still queued (e.g. the state stopped being valid). */
skb_queue_purge(&neigh->arp_queue);
}
out:
write_unlock_bh(&neigh->lock);
#ifdef CONFIG_ARPD
/* Notification happens after the lock is released. */
if (notify && neigh->parms->app_probes)
neigh_app_notify(neigh);
#endif
return err;
}
/*
 * neigh_sync - bring an entry's NUD state in line with the time elapsed
 * since it was last confirmed.
 *
 * NUD_NOARP and NUD_PERMANENT entries are left alone.
 * A NUD_REACHABLE entry whose confirmation is older than reachable_time is
 * demoted to NUD_STALE and switched to the slow output path.
 * Any other valid entry that was confirmed less than reachable_time ago is
 * promoted to NUD_REACHABLE: its timer is deleted and the fast output path
 * is selected.
 *
 * ASSERT_WL(n) is invoked first; presumably it checks that the caller holds
 * the entry's write lock -- confirm against the macro definition.
 */
static void neigh_sync(struct neighbour *n)
{
	unsigned long now = jiffies;
	u8 state = n->nud_state;

	ASSERT_WL(n);
	if (state&(NUD_NOARP|NUD_PERMANENT))
		return;
	if (state&NUD_REACHABLE) {
		if (now - n->confirmed > n->parms->reachable_time) {
			n->nud_state = NUD_STALE;
			neigh_suspect(n);
		}
	} else if (state&NUD_VALID) {
		/* The published text had this comparison mangled into
		   "<>parms->reachable_time"; it mirrors the test above. */
		if (now - n->confirmed < n->parms->reachable_time) {
			neigh_del_timer(n);
			n->nud_state = NUD_REACHABLE;
			neigh_connect(n);
		}
	}
}
static void neigh_connect(struct neighbour *neigh)
{
struct hh_cache *hh;

NEIGH_PRINTK2("neigh %p is connected.\n", neigh);

ASSERT_WL(neigh);

neigh->output = neigh->ops->connected_output;

for (hh = neigh->hh; hh; hh = hh->hh_next)
hh->hh_output = neigh->ops->hh_output;
}
/*
 * neigh_suspect - switch a neighbour to its generic output function.
 *
 * Both the entry's own output and the output of every cached hardware
 * header chained off neigh->hh are pointed at ops->output, so packets no
 * longer take the connected-state shortcut.
 */
static void neigh_suspect(struct neighbour *neigh)
{
	struct hh_cache *hh;

	/* Debug message typo fixed: it used to read "suspecteded". */
	NEIGH_PRINTK2("neigh %p is suspected.\n", neigh);

	ASSERT_WL(neigh);

	neigh->output = neigh->ops->output;

	for (hh = neigh->hh; hh; hh = hh->hh_next)
		hh->hh_output = neigh->ops->output;
}
static __inline__ void neigh_update_hhs(struct neighbour *neigh)
{
struct hh_cache *hh;
void (*update)(struct hh_cache*, struct net_device*, unsigned char*) =
neigh->dev->header_cache_update;

if (update) {
for (hh=neigh->hh; hh; hh=hh->hh_next) {
write_lock_bh(&hh->hh_lock);
update(hh, neigh->dev, neigh->ha);
write_unlock_bh(&hh->hh_lock);
}
}
}

/*
 * One proxy-neighbour entry, kept in the per-table phash_buckets hash
 * chains (see pneigh_lookup()).
 */
struct pneigh_entry
{
/* Next entry in the same hash bucket. */
struct pneigh_entry *next;
/* Device the entry applies to; pneigh_lookup() treats NULL as matching any device. */
struct net_device *dev;
/* Protocol address key; tbl->key_len bytes are allocated right behind the struct. */
u8 key[0];
};
/*
 * pneigh_lookup - find, and optionally create, a proxy-neighbour entry.
 *
 * tbl:   table whose phash_buckets are searched.
 * pkey:  protocol address, tbl->key_len bytes long.
 * dev:   device to match; entries with a NULL dev match every device.
 * creat: non-zero allocates and inserts a new entry when none matches.
 *
 * The bucket index is derived from the last four bytes of the key.
 * The search runs under the table read lock, the insertion under the
 * write lock; allocation happens with no lock held.
 *
 * Returns the entry, or NULL when nothing matched and creat is zero, when
 * the allocation failed, or when tbl->pconstructor rejected the new entry.
 */
struct pneigh_entry * pneigh_lookup(struct neigh_table *tbl, const void *pkey,
				    struct net_device *dev, int creat)
{
	struct pneigh_entry *entry;
	u32 bucket;
	int key_len = tbl->key_len;

	/* Fold the trailing 32 bits of the key down to a bucket index. */
	bucket = *(u32*)(pkey + key_len - 4);
	bucket ^= (bucket>>16);
	bucket ^= bucket>>8;
	bucket ^= bucket>>4;
	bucket &= PNEIGH_HASHMASK;

	read_lock_bh(&tbl->lock);
	for (entry = tbl->phash_buckets[bucket]; entry; entry = entry->next) {
		if (memcmp(entry->key, pkey, key_len) != 0)
			continue;
		if (entry->dev == dev || !entry->dev)
			break;
	}
	read_unlock_bh(&tbl->lock);

	/* entry is NULL here exactly when the chain held no match. */
	if (entry || !creat)
		return entry;

	entry = kmalloc(sizeof(*entry) + key_len, GFP_KERNEL);
	if (!entry)
		return NULL;

	memcpy(entry->key, pkey, key_len);
	entry->dev = dev;

	if (tbl->pconstructor && tbl->pconstructor(entry)) {
		kfree(entry);
		return NULL;
	}

	/* Push the new entry onto the head of its bucket chain. */
	write_lock_bh(&tbl->lock);
	entry->next = tbl->phash_buckets[bucket];
	tbl->phash_buckets[bucket] = entry;
	write_unlock_bh(&tbl->lock);
	return entry;
}
void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p,
struct sk_buff *skb) 缓冲外部设备的arp请求包
{
unsigned long now = jiffies;
long sched_next = net_random()%p->proxy_delay; 下一延迟间隔(0.8秒以内)

if (tbl->proxy_queue.qlen > p->proxy_qlen) {
kfree_skb(skb);
return;
}
skb->stamp.tv_sec = 0;
skb->stamp.tv_usec = now + sched_next;

spin_lock(&tbl->proxy_queue.lock);
if (del_timer(&tbl->proxy_timer)) {
long tval = tbl->proxy_timer.expires - now; 取定时器剩余时间
if (tval < sched_next)
sched_next = tval; 取大者
}
dst_release(skb->dst);
skb->dst = NULL;
dev_hold(skb->dev);
__skb_queue_tail(&tbl->proxy_queue, skb);
mod_timer(&tbl->proxy_timer, now + sched_next);
spin_unlock(&tbl->proxy_queue.lock);
}
/*
 * neigh_proxy_process - proxy_timer handler: replay the queued requests
 * that have become due.
 *
 * arg is the neigh_table pointer cast to unsigned long.
 *
 * Walks proxy_queue under its lock.  A packet whose due time (stored in
 * stamp.tv_usec) has been reached is unlinked and handed to
 * tbl->proxy_redo when that hook exists and the device is running;
 * otherwise it is freed.  The device reference held for the queued packet
 * is dropped either way.  The timer is then re-armed for the nearest
 * remaining due time, if any packet is left waiting.
 */
static void neigh_proxy_process(unsigned long arg)
{
	struct neigh_table *tbl = (struct neigh_table *)arg;
	long earliest = 0;
	unsigned long now = jiffies;
	struct sk_buff *cur;
	struct sk_buff *following;

	spin_lock(&tbl->proxy_queue.lock);

	for (cur = tbl->proxy_queue.next;
	     cur != (struct sk_buff*)&tbl->proxy_queue;
	     cur = following) {
		long remaining = cur->stamp.tv_usec - now;
		struct net_device *dev;

		/* Fetch the successor first: cur may be unlinked below. */
		following = cur->next;

		if (remaining > 0) {
			/* Not due yet -- remember the nearest deadline. */
			if (!earliest || remaining < earliest)
				earliest = remaining;
			continue;
		}

		dev = cur->dev;
		__skb_unlink(cur, &tbl->proxy_queue);
		if (tbl->proxy_redo && netif_running(dev))
			tbl->proxy_redo(cur);
		else
			kfree_skb(cur);

		dev_put(dev);
	}

	del_timer(&tbl->proxy_timer);
	if (earliest)
		mod_timer(&tbl->proxy_timer, jiffies + earliest);
	spin_unlock(&tbl->proxy_queue.lock);
}
/*
 * parp_redo - feed a delayed proxy ARP request back into arp_rcv() on the
 * device recorded in the packet, with a NULL third argument.
 */
static void parp_redo(struct sk_buff *skb)
{
	struct net_device *dev = skb->dev;

	arp_rcv(skb, dev, NULL);
}
/*
 * arp_error_report - report a link failure for a packet and free it.
 *
 * neigh is not used by this function.  skb is passed to
 * dst_link_failure() and then released with kfree_skb().
 */
static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb)
{
dst_link_failure(skb);
kfree_skb(skb);
}
/*
 * dst_link_failure - invoke the link_failure hook of the packet's route.
 *
 * Silently does nothing when the packet has no route, the route has no
 * ops table, or the ops table has no link_failure hook.
 */
static inline void dst_link_failure(struct sk_buff *skb)
{
	struct dst_entry *route = skb->dst;

	if (!route || !route->ops || !route->ops->link_failure)
		return;

	route->ops->link_failure(skb);
}


Trackback: http://tb.blog.csdn.net/TrackBack.aspx?PostId=117148

No comments:

如何发掘出更多退休的钱?

如何发掘出更多退休的钱? http://bbs.wenxuecity.com/bbs/tzlc/1328415.html 按照常规的说法,退休的收入必须得有退休前的80%,或者是4% withdrawal rule,而且每年还得要加2-3%对付通胀,这是一个很大...