]>
Commit | Line | Data |
---|---|---|
d2912cb1 | 1 | // SPDX-License-Identifier: GPL-2.0-only |
c7232c99 PM |
2 | /* |
3 | * (C) 1999-2001 Paul `Rusty' Russell | |
5b1158e9 | 4 | * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> |
c7232c99 | 5 | * (C) 2011 Patrick McHardy <kaber@trash.net> |
5b1158e9 JK |
6 | */ |
7 | ||
5191d70f AS |
8 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt |
9 | ||
5b1158e9 JK |
10 | #include <linux/module.h> |
11 | #include <linux/types.h> | |
12 | #include <linux/timer.h> | |
13 | #include <linux/skbuff.h> | |
5a0e3ad6 | 14 | #include <linux/gfp.h> |
c7232c99 | 15 | #include <net/xfrm.h> |
dd6d2910 | 16 | #include <linux/siphash.h> |
c7232c99 | 17 | #include <linux/rtnetlink.h> |
5b1158e9 | 18 | |
5b1158e9 JK |
19 | #include <net/netfilter/nf_conntrack.h> |
20 | #include <net/netfilter/nf_conntrack_core.h> | |
5b1158e9 | 21 | #include <net/netfilter/nf_conntrack_helper.h> |
41d73ec0 | 22 | #include <net/netfilter/nf_conntrack_seqadj.h> |
5d0aa2cc | 23 | #include <net/netfilter/nf_conntrack_zones.h> |
40d102cd JS |
24 | #include <net/netfilter/nf_nat.h> |
25 | #include <net/netfilter/nf_nat_helper.h> | |
26 | #include <uapi/linux/netfilter/nf_nat.h> | |
5b1158e9 | 27 | |
1cd472bf FW |
28 | #include "nf_internals.h" |
29 | ||
/* Spinlocks serialising writers of the nf_nat_bysource buckets; the lock for
 * a bucket is selected as nf_nat_locks[hash % CONNTRACK_LOCKS].
 */
8073e960 | 30 | static spinlock_t nf_nat_locks[CONNTRACK_LOCKS]; |
e1bf1687 | 31 | |
/* NOTE(review): nf_nat_proto_mutex and nat_net_id are not referenced in the
 * part of the file visible here; nat_net_id is presumably the pernet id used
 * to look up struct nat_net - confirm against the registration code.
 */
c7232c99 | 32 | static DEFINE_MUTEX(nf_nat_proto_mutex); |
1cd472bf | 33 | static unsigned int nat_net_id __read_mostly; |
a76ae1c8 | 34 | |
e1bf1687 FW |
/* Hash table of conntracks keyed by hash_by_src() of their original tuple;
 * entries are linked through nf_conn::nat_bysource and walked under RCU.
 */
35 | static struct hlist_head *nf_nat_bysource __read_mostly; |
/* Number of buckets in nf_nat_bysource; hash_by_src() scales into it. */
36 | static unsigned int nf_nat_htable_size __read_mostly; | |
/* siphash key for hash_by_src(), filled once via get_random_once(). */
49ecc2e9 | 37 | static siphash_aligned_key_t nf_nat_hash_rnd; |
c7232c99 | 38 | |
1cd472bf FW |
/* Private data passed to nf_nat_inet_fn() through its priv argument.
 *
 * entries is an RCU-protected set of lookup hooks; nf_nat_inet_fn() runs them
 * in order for a conntrack whose NAT mapping has not been set up yet.
 * NOTE(review): rcu_head is presumably used to free this object after a grace
 * period - the code doing so is not visible here, confirm.
 */
39 | struct nf_nat_lookup_hook_priv {
40 | struct nf_hook_entries __rcu *entries; | |
41 | ||
42 | struct rcu_head rcu_head; | |
43 | }; | |
44 | ||
/* NAT hook bookkeeping for one protocol family.
 *
 * NOTE(review): neither member is used in the part of the file visible here;
 * nat_hook_ops presumably points at the registered base hook ops and users
 * looks like a count of registrations sharing them - confirm against the
 * register/unregister code.
 */
45 | struct nf_nat_hooks_net { | |
46 | struct nf_hook_ops *nat_hook_ops; | |
47 | unsigned int users; | |
48 | }; | |
49 | ||
/* Container holding one struct nf_nat_hooks_net slot per NFPROTO_* family.
 * NOTE(review): presumably the per-network-namespace NAT state looked up via
 * nat_net_id - confirm, the lookup is not visible here.
 */
50 | struct nat_net { | |
51 | struct nf_nat_hooks_net nat_proto_net[NFPROTO_NUMPROTO]; | |
52 | }; | |
53 | ||
c7232c99 | 54 | #ifdef CONFIG_XFRM |
096d0906 FW |
55 | static void nf_nat_ipv4_decode_session(struct sk_buff *skb, |
56 | const struct nf_conn *ct, | |
57 | enum ip_conntrack_dir dir, | |
58 | unsigned long statusbit, | |
59 | struct flowi *fl) | |
60 | { | |
61 | const struct nf_conntrack_tuple *t = &ct->tuplehash[dir].tuple; | |
62 | struct flowi4 *fl4 = &fl->u.ip4; | |
63 | ||
64 | if (ct->status & statusbit) { | |
65 | fl4->daddr = t->dst.u3.ip; | |
66 | if (t->dst.protonum == IPPROTO_TCP || | |
67 | t->dst.protonum == IPPROTO_UDP || | |
68 | t->dst.protonum == IPPROTO_UDPLITE || | |
69 | t->dst.protonum == IPPROTO_DCCP || | |
70 | t->dst.protonum == IPPROTO_SCTP) | |
71 | fl4->fl4_dport = t->dst.u.all; | |
72 | } | |
73 | ||
74 | statusbit ^= IPS_NAT_MASK; | |
75 | ||
76 | if (ct->status & statusbit) { | |
77 | fl4->saddr = t->src.u3.ip; | |
78 | if (t->dst.protonum == IPPROTO_TCP || | |
79 | t->dst.protonum == IPPROTO_UDP || | |
80 | t->dst.protonum == IPPROTO_UDPLITE || | |
81 | t->dst.protonum == IPPROTO_DCCP || | |
82 | t->dst.protonum == IPPROTO_SCTP) | |
83 | fl4->fl4_sport = t->src.u.all; | |
84 | } | |
85 | } | |
86 | ||
87 | static void nf_nat_ipv6_decode_session(struct sk_buff *skb, | |
88 | const struct nf_conn *ct, | |
89 | enum ip_conntrack_dir dir, | |
90 | unsigned long statusbit, | |
91 | struct flowi *fl) | |
92 | { | |
93 | #if IS_ENABLED(CONFIG_IPV6) | |
94 | const struct nf_conntrack_tuple *t = &ct->tuplehash[dir].tuple; | |
95 | struct flowi6 *fl6 = &fl->u.ip6; | |
96 | ||
97 | if (ct->status & statusbit) { | |
98 | fl6->daddr = t->dst.u3.in6; | |
99 | if (t->dst.protonum == IPPROTO_TCP || | |
100 | t->dst.protonum == IPPROTO_UDP || | |
101 | t->dst.protonum == IPPROTO_UDPLITE || | |
102 | t->dst.protonum == IPPROTO_DCCP || | |
103 | t->dst.protonum == IPPROTO_SCTP) | |
104 | fl6->fl6_dport = t->dst.u.all; | |
105 | } | |
106 | ||
107 | statusbit ^= IPS_NAT_MASK; | |
108 | ||
109 | if (ct->status & statusbit) { | |
110 | fl6->saddr = t->src.u3.in6; | |
111 | if (t->dst.protonum == IPPROTO_TCP || | |
112 | t->dst.protonum == IPPROTO_UDP || | |
113 | t->dst.protonum == IPPROTO_UDPLITE || | |
114 | t->dst.protonum == IPPROTO_DCCP || | |
115 | t->dst.protonum == IPPROTO_SCTP) | |
116 | fl6->fl6_sport = t->src.u.all; | |
117 | } | |
118 | #endif | |
119 | } | |
120 | ||
c7232c99 PM |
121 | static void __nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl) |
122 | { | |
c7232c99 PM |
123 | const struct nf_conn *ct; |
124 | enum ip_conntrack_info ctinfo; | |
125 | enum ip_conntrack_dir dir; | |
126 | unsigned long statusbit; | |
127 | u8 family; | |
128 | ||
129 | ct = nf_ct_get(skb, &ctinfo); | |
130 | if (ct == NULL) | |
131 | return; | |
132 | ||
53890234 | 133 | family = nf_ct_l3num(ct); |
c7232c99 PM |
134 | dir = CTINFO2DIR(ctinfo); |
135 | if (dir == IP_CT_DIR_ORIGINAL) | |
136 | statusbit = IPS_DST_NAT; | |
137 | else | |
138 | statusbit = IPS_SRC_NAT; | |
139 | ||
096d0906 FW |
140 | switch (family) { |
141 | case NFPROTO_IPV4: | |
142 | nf_nat_ipv4_decode_session(skb, ct, dir, statusbit, fl); | |
143 | return; | |
144 | case NFPROTO_IPV6: | |
145 | nf_nat_ipv6_decode_session(skb, ct, dir, statusbit, fl); | |
146 | return; | |
147 | } | |
c7232c99 | 148 | } |
c7232c99 PM |
149 | #endif /* CONFIG_XFRM */ |
150 | ||
e1bf1687 FW |
151 | /* We keep an extra hash for each conntrack, for fast searching. */ |
152 | static unsigned int | |
d2966dc7 FW |
153 | hash_by_src(const struct net *net, |
154 | const struct nf_conntrack_zone *zone, | |
155 | const struct nf_conntrack_tuple *tuple) | |
5b1158e9 | 156 | { |
e1bf1687 | 157 | unsigned int hash; |
dd6d2910 FW |
158 | struct { |
159 | struct nf_conntrack_man src; | |
160 | u32 net_mix; | |
161 | u32 protonum; | |
d2966dc7 | 162 | u32 zone; |
dd6d2910 | 163 | } __aligned(SIPHASH_ALIGNMENT) combined; |
e1bf1687 FW |
164 | |
165 | get_random_once(&nf_nat_hash_rnd, sizeof(nf_nat_hash_rnd)); | |
7001c6d1 | 166 | |
dd6d2910 FW |
167 | memset(&combined, 0, sizeof(combined)); |
168 | ||
5b1158e9 | 169 | /* Original src, to ensure we map it consistently if poss. */ |
dd6d2910 | 170 | combined.src = tuple->src; |
d2966dc7 | 171 | combined.net_mix = net_hash_mix(net); |
dd6d2910 FW |
172 | combined.protonum = tuple->dst.protonum; |
173 | ||
d2966dc7 FW |
174 | /* Zone ID can be used provided its valid for both directions */ |
175 | if (zone->dir == NF_CT_DEFAULT_ZONE_DIR) | |
176 | combined.zone = zone->id; | |
177 | ||
dd6d2910 | 178 | hash = siphash(&combined, sizeof(combined), &nf_nat_hash_rnd); |
8fc54f68 | 179 | |
e1bf1687 | 180 | return reciprocal_scale(hash, nf_nat_htable_size); |
5b1158e9 JK |
181 | } |
182 | ||
5b1158e9 | 183 | /* Is this tuple already taken? (not by us) */ |
472caa69 | 184 | static int |
5b1158e9 JK |
185 | nf_nat_used_tuple(const struct nf_conntrack_tuple *tuple, |
186 | const struct nf_conn *ignored_conntrack) | |
187 | { | |
188 | /* Conntrack tracking doesn't keep track of outgoing tuples; only | |
c7232c99 PM |
189 | * incoming ones. NAT means they don't have a fixed mapping, |
190 | * so we invert the tuple and look for the incoming reply. | |
191 | * | |
192 | * We could keep a separate hash if this proves too slow. | |
193 | */ | |
5b1158e9 JK |
194 | struct nf_conntrack_tuple reply; |
195 | ||
303e0c55 | 196 | nf_ct_invert_tuple(&reply, tuple); |
5b1158e9 JK |
197 | return nf_conntrack_tuple_taken(&reply, ignored_conntrack); |
198 | } | |
5b1158e9 | 199 | |
40e786bd FW |
200 | static bool nf_nat_inet_in_range(const struct nf_conntrack_tuple *t, |
201 | const struct nf_nat_range2 *range) | |
202 | { | |
203 | if (t->src.l3num == NFPROTO_IPV4) | |
204 | return ntohl(t->src.u3.ip) >= ntohl(range->min_addr.ip) && | |
205 | ntohl(t->src.u3.ip) <= ntohl(range->max_addr.ip); | |
206 | ||
207 | return ipv6_addr_cmp(&t->src.u3.in6, &range->min_addr.in6) >= 0 && | |
208 | ipv6_addr_cmp(&t->src.u3.in6, &range->max_addr.in6) <= 0; | |
209 | } | |
210 | ||
fe2d0020 FW |
211 | /* Is the manipable part of the tuple between min and max incl? */ |
212 | static bool l4proto_in_range(const struct nf_conntrack_tuple *tuple, | |
213 | enum nf_nat_manip_type maniptype, | |
214 | const union nf_conntrack_man_proto *min, | |
215 | const union nf_conntrack_man_proto *max) | |
216 | { | |
217 | __be16 port; | |
218 | ||
219 | switch (tuple->dst.protonum) { | |
35acfbab | 220 | case IPPROTO_ICMP: |
fe2d0020 FW |
221 | case IPPROTO_ICMPV6: |
222 | return ntohs(tuple->src.u.icmp.id) >= ntohs(min->icmp.id) && | |
223 | ntohs(tuple->src.u.icmp.id) <= ntohs(max->icmp.id); | |
224 | case IPPROTO_GRE: /* all fall though */ | |
225 | case IPPROTO_TCP: | |
226 | case IPPROTO_UDP: | |
227 | case IPPROTO_UDPLITE: | |
228 | case IPPROTO_DCCP: | |
229 | case IPPROTO_SCTP: | |
230 | if (maniptype == NF_NAT_MANIP_SRC) | |
231 | port = tuple->src.u.all; | |
232 | else | |
233 | port = tuple->dst.u.all; | |
234 | ||
235 | return ntohs(port) >= ntohs(min->all) && | |
236 | ntohs(port) <= ntohs(max->all); | |
237 | default: | |
238 | return true; | |
239 | } | |
240 | } | |
241 | ||
5b1158e9 | 242 | /* If we source map this tuple so reply looks like reply_tuple, will |
c7232c99 PM |
243 | * that meet the constraints of range. |
244 | */ | |
fe2d0020 | 245 | static int in_range(const struct nf_conntrack_tuple *tuple, |
2eb0f624 | 246 | const struct nf_nat_range2 *range) |
5b1158e9 | 247 | { |
5b1158e9 | 248 | /* If we are supposed to map IPs, then we must be in the |
c7232c99 PM |
249 | * range specified, otherwise let this drag us onto a new src IP. |
250 | */ | |
251 | if (range->flags & NF_NAT_RANGE_MAP_IPS && | |
40e786bd | 252 | !nf_nat_inet_in_range(tuple, range)) |
c7232c99 | 253 | return 0; |
5b1158e9 | 254 | |
fe2d0020 | 255 | if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) |
c7232c99 | 256 | return 1; |
5b1158e9 | 257 | |
fe2d0020 FW |
258 | return l4proto_in_range(tuple, NF_NAT_MANIP_SRC, |
259 | &range->min_proto, &range->max_proto); | |
5b1158e9 JK |
260 | } |
261 | ||
262 | static inline int | |
263 | same_src(const struct nf_conn *ct, | |
264 | const struct nf_conntrack_tuple *tuple) | |
265 | { | |
266 | const struct nf_conntrack_tuple *t; | |
267 | ||
268 | t = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; | |
269 | return (t->dst.protonum == tuple->dst.protonum && | |
c7232c99 | 270 | nf_inet_addr_cmp(&t->src.u3, &tuple->src.u3) && |
5b1158e9 JK |
271 | t->src.u.all == tuple->src.u.all); |
272 | } | |
273 | ||
274 | /* Only called for SRC manip */ | |
275 | static int | |
308ac914 DB |
276 | find_appropriate_src(struct net *net, |
277 | const struct nf_conntrack_zone *zone, | |
0c4c9288 | 278 | const struct nf_conntrack_tuple *tuple, |
5b1158e9 | 279 | struct nf_conntrack_tuple *result, |
2eb0f624 | 280 | const struct nf_nat_range2 *range) |
5b1158e9 | 281 | { |
d2966dc7 | 282 | unsigned int h = hash_by_src(net, zone, tuple); |
72b72949 | 283 | const struct nf_conn *ct; |
870190a9 | 284 | |
e1bf1687 FW |
285 | hlist_for_each_entry_rcu(ct, &nf_nat_bysource[h], nat_bysource) { |
286 | if (same_src(ct, tuple) && | |
287 | net_eq(net, nf_ct_net(ct)) && | |
288 | nf_ct_zone_equal(ct, zone, IP_CT_DIR_ORIGINAL)) { | |
289 | /* Copy source part from reply tuple. */ | |
303e0c55 | 290 | nf_ct_invert_tuple(result, |
e1bf1687 FW |
291 | &ct->tuplehash[IP_CT_DIR_REPLY].tuple); |
292 | result->dst = tuple->dst; | |
293 | ||
fe2d0020 | 294 | if (in_range(result, range)) |
e1bf1687 FW |
295 | return 1; |
296 | } | |
97772bcd | 297 | } |
97772bcd | 298 | return 0; |
5b1158e9 JK |
299 | } |
300 | ||
301 | /* For [FUTURE] fragmentation handling, we want the least-used | |
c7232c99 PM |
302 | * src-ip/dst-ip/proto triple. Fairness doesn't come into it. Thus |
303 | * if the range specifies 1.2.3.4 ports 10000-10005 and 1.2.3.5 ports | |
304 | * 1-65535, we don't do pro-rata allocation based on ports; we choose | |
305 | * the ip with the lowest src-ip/dst-ip/proto usage. | |
306 | */ | |
5b1158e9 | 307 | static void |
308ac914 DB |
308 | find_best_ips_proto(const struct nf_conntrack_zone *zone, |
309 | struct nf_conntrack_tuple *tuple, | |
2eb0f624 | 310 | const struct nf_nat_range2 *range, |
5b1158e9 JK |
311 | const struct nf_conn *ct, |
312 | enum nf_nat_manip_type maniptype) | |
313 | { | |
c7232c99 PM |
314 | union nf_inet_addr *var_ipp; |
315 | unsigned int i, max; | |
5b1158e9 | 316 | /* Host order */ |
c7232c99 PM |
317 | u32 minip, maxip, j, dist; |
318 | bool full_range; | |
5b1158e9 JK |
319 | |
320 | /* No IP mapping? Do nothing. */ | |
cbc9f2f4 | 321 | if (!(range->flags & NF_NAT_RANGE_MAP_IPS)) |
5b1158e9 JK |
322 | return; |
323 | ||
cbc9f2f4 | 324 | if (maniptype == NF_NAT_MANIP_SRC) |
c7232c99 | 325 | var_ipp = &tuple->src.u3; |
5b1158e9 | 326 | else |
c7232c99 | 327 | var_ipp = &tuple->dst.u3; |
5b1158e9 JK |
328 | |
329 | /* Fast path: only one choice. */ | |
c7232c99 PM |
330 | if (nf_inet_addr_cmp(&range->min_addr, &range->max_addr)) { |
331 | *var_ipp = range->min_addr; | |
5b1158e9 JK |
332 | return; |
333 | } | |
334 | ||
c7232c99 PM |
335 | if (nf_ct_l3num(ct) == NFPROTO_IPV4) |
336 | max = sizeof(var_ipp->ip) / sizeof(u32) - 1; | |
337 | else | |
338 | max = sizeof(var_ipp->ip6) / sizeof(u32) - 1; | |
339 | ||
5b1158e9 JK |
340 | /* Hashing source and destination IPs gives a fairly even |
341 | * spread in practice (if there are a small number of IPs | |
342 | * involved, there usually aren't that many connections | |
343 | * anyway). The consistency means that servers see the same | |
344 | * client coming from the same IP (some Internet Banking sites | |
c7232c99 PM |
345 | * like this), even across reboots. |
346 | */ | |
5693d68d | 347 | j = jhash2((u32 *)&tuple->src.u3, sizeof(tuple->src.u3) / sizeof(u32), |
c7232c99 | 348 | range->flags & NF_NAT_RANGE_PERSISTENT ? |
308ac914 | 349 | 0 : (__force u32)tuple->dst.u3.all[max] ^ zone->id); |
c7232c99 PM |
350 | |
351 | full_range = false; | |
352 | for (i = 0; i <= max; i++) { | |
353 | /* If first bytes of the address are at the maximum, use the | |
354 | * distance. Otherwise use the full range. | |
355 | */ | |
356 | if (!full_range) { | |
357 | minip = ntohl((__force __be32)range->min_addr.all[i]); | |
358 | maxip = ntohl((__force __be32)range->max_addr.all[i]); | |
359 | dist = maxip - minip + 1; | |
360 | } else { | |
361 | minip = 0; | |
362 | dist = ~0; | |
363 | } | |
364 | ||
365 | var_ipp->all[i] = (__force __u32) | |
8fc54f68 | 366 | htonl(minip + reciprocal_scale(j, dist)); |
c7232c99 PM |
367 | if (var_ipp->all[i] != range->max_addr.all[i]) |
368 | full_range = true; | |
369 | ||
370 | if (!(range->flags & NF_NAT_RANGE_PERSISTENT)) | |
371 | j ^= (__force u32)tuple->dst.u3.all[i]; | |
372 | } | |
5b1158e9 JK |
373 | } |
374 | ||
203f2e78 FW |
375 | /* Alter the per-proto part of the tuple (depending on maniptype), to |
376 | * give a unique tuple in the given range if possible. | |
377 | * | |
378 | * Per-protocol part of tuple is initialized to the incoming packet. | |
379 | */ | |
716b23c1 FW |
380 | static void nf_nat_l4proto_unique_tuple(struct nf_conntrack_tuple *tuple, |
381 | const struct nf_nat_range2 *range, | |
382 | enum nf_nat_manip_type maniptype, | |
383 | const struct nf_conn *ct) | |
384 | { | |
385 | unsigned int range_size, min, max, i, attempts; | |
203f2e78 | 386 | __be16 *keyptr; |
716b23c1 FW |
387 | u16 off; |
388 | static const unsigned int max_attempts = 128; | |
389 | ||
203f2e78 | 390 | switch (tuple->dst.protonum) { |
954d8297 | 391 | case IPPROTO_ICMP: |
203f2e78 FW |
392 | case IPPROTO_ICMPV6: |
393 | /* id is same for either direction... */ | |
394 | keyptr = &tuple->src.u.icmp.id; | |
5bdac418 FW |
395 | if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) { |
396 | min = 0; | |
397 | range_size = 65536; | |
398 | } else { | |
399 | min = ntohs(range->min_proto.icmp.id); | |
400 | range_size = ntohs(range->max_proto.icmp.id) - | |
401 | ntohs(range->min_proto.icmp.id) + 1; | |
402 | } | |
203f2e78 FW |
403 | goto find_free_id; |
404 | #if IS_ENABLED(CONFIG_NF_CT_PROTO_GRE) | |
405 | case IPPROTO_GRE: | |
406 | /* If there is no master conntrack we are not PPTP, | |
407 | do not change tuples */ | |
408 | if (!ct->master) | |
409 | return; | |
410 | ||
411 | if (maniptype == NF_NAT_MANIP_SRC) | |
412 | keyptr = &tuple->src.u.gre.key; | |
413 | else | |
414 | keyptr = &tuple->dst.u.gre.key; | |
415 | ||
416 | if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) { | |
417 | min = 1; | |
418 | range_size = 65535; | |
419 | } else { | |
420 | min = ntohs(range->min_proto.gre.key); | |
421 | range_size = ntohs(range->max_proto.gre.key) - min + 1; | |
422 | } | |
423 | goto find_free_id; | |
424 | #endif | |
954d8297 GS |
425 | case IPPROTO_UDP: |
426 | case IPPROTO_UDPLITE: | |
427 | case IPPROTO_TCP: | |
428 | case IPPROTO_SCTP: | |
429 | case IPPROTO_DCCP: | |
203f2e78 FW |
430 | if (maniptype == NF_NAT_MANIP_SRC) |
431 | keyptr = &tuple->src.u.all; | |
432 | else | |
433 | keyptr = &tuple->dst.u.all; | |
434 | ||
435 | break; | |
436 | default: | |
437 | return; | |
438 | } | |
716b23c1 FW |
439 | |
440 | /* If no range specified... */ | |
441 | if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) { | |
442 | /* If it's dst rewrite, can't change port */ | |
443 | if (maniptype == NF_NAT_MANIP_DST) | |
444 | return; | |
445 | ||
203f2e78 | 446 | if (ntohs(*keyptr) < 1024) { |
716b23c1 | 447 | /* Loose convention: >> 512 is credential passing */ |
203f2e78 | 448 | if (ntohs(*keyptr) < 512) { |
716b23c1 FW |
449 | min = 1; |
450 | range_size = 511 - min + 1; | |
451 | } else { | |
452 | min = 600; | |
453 | range_size = 1023 - min + 1; | |
454 | } | |
455 | } else { | |
456 | min = 1024; | |
457 | range_size = 65535 - 1024 + 1; | |
458 | } | |
459 | } else { | |
460 | min = ntohs(range->min_proto.all); | |
461 | max = ntohs(range->max_proto.all); | |
462 | if (unlikely(max < min)) | |
463 | swap(max, min); | |
464 | range_size = max - min + 1; | |
465 | } | |
466 | ||
203f2e78 | 467 | find_free_id: |
716b23c1 | 468 | if (range->flags & NF_NAT_RANGE_PROTO_OFFSET) |
203f2e78 | 469 | off = (ntohs(*keyptr) - ntohs(range->base_proto.all)); |
716b23c1 FW |
470 | else |
471 | off = prandom_u32(); | |
472 | ||
473 | attempts = range_size; | |
474 | if (attempts > max_attempts) | |
475 | attempts = max_attempts; | |
476 | ||
477 | /* We are in softirq; doing a search of the entire range risks | |
478 | * soft lockup when all tuples are already used. | |
479 | * | |
480 | * If we can't find any free port from first offset, pick a new | |
481 | * one and try again, with ever smaller search window. | |
482 | */ | |
483 | another_round: | |
484 | for (i = 0; i < attempts; i++, off++) { | |
203f2e78 | 485 | *keyptr = htons(min + off % range_size); |
716b23c1 FW |
486 | if (!nf_nat_used_tuple(tuple, ct)) |
487 | return; | |
488 | } | |
489 | ||
490 | if (attempts >= range_size || attempts < 16) | |
491 | return; | |
492 | attempts /= 2; | |
493 | off = prandom_u32(); | |
494 | goto another_round; | |
495 | } | |
496 | ||
878aed8d FW |
497 | static bool tuple_force_port_remap(const struct nf_conntrack_tuple *tuple) |
498 | { | |
499 | u16 sp, dp; | |
500 | ||
501 | switch (tuple->dst.protonum) { | |
502 | case IPPROTO_TCP: | |
503 | sp = ntohs(tuple->src.u.tcp.port); | |
504 | dp = ntohs(tuple->dst.u.tcp.port); | |
505 | break; | |
506 | case IPPROTO_UDP: | |
507 | case IPPROTO_UDPLITE: | |
508 | sp = ntohs(tuple->src.u.udp.port); | |
509 | dp = ntohs(tuple->dst.u.udp.port); | |
510 | break; | |
511 | default: | |
512 | return false; | |
513 | } | |
514 | ||
515 | /* IANA: System port range: 1-1023, | |
516 | * user port range: 1024-49151, | |
517 | * private port range: 49152-65535. | |
518 | * | |
519 | * Linux default ephemeral port range is 32768-60999. | |
520 | * | |
521 | * Enforce port remapping if sport is significantly lower | |
522 | * than dport to prevent NAT port shadowing, i.e. | |
523 | * accidental match of 'new' inbound connection vs. | |
524 | * existing outbound one. | |
525 | */ | |
526 | return sp < 16384 && dp >= 32768; | |
527 | } | |
528 | ||
c7232c99 PM |
529 | /* Manipulate the tuple into the range given. For NF_INET_POST_ROUTING, |
530 | * we change the source to map into the range. For NF_INET_PRE_ROUTING | |
6e23ae2a | 531 | * and NF_INET_LOCAL_OUT, we change the destination to map into the |
c7232c99 | 532 | * range. It might not be possible to get a unique tuple, but we try. |
5b1158e9 | 533 | * At worst (or if we race), we will end up with a final duplicate in |
05ba4c89 | 534 | * __nf_conntrack_confirm and drop the packet. */ |
5b1158e9 JK |
535 | static void |
536 | get_unique_tuple(struct nf_conntrack_tuple *tuple, | |
537 | const struct nf_conntrack_tuple *orig_tuple, | |
2eb0f624 | 538 | const struct nf_nat_range2 *range, |
5b1158e9 JK |
539 | struct nf_conn *ct, |
540 | enum nf_nat_manip_type maniptype) | |
541 | { | |
878aed8d | 542 | bool random_port = range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL; |
308ac914 | 543 | const struct nf_conntrack_zone *zone; |
0c4c9288 | 544 | struct net *net = nf_ct_net(ct); |
308ac914 DB |
545 | |
546 | zone = nf_ct_zone(ct); | |
5b1158e9 | 547 | |
878aed8d FW |
548 | if (maniptype == NF_NAT_MANIP_SRC && |
549 | !random_port && | |
550 | !ct->local_origin) | |
551 | random_port = tuple_force_port_remap(orig_tuple); | |
552 | ||
c7232c99 PM |
553 | /* 1) If this srcip/proto/src-proto-part is currently mapped, |
554 | * and that same mapping gives a unique tuple within the given | |
555 | * range, use that. | |
556 | * | |
557 | * This is only required for source (ie. NAT/masq) mappings. | |
558 | * So far, we don't do local source mappings, so multiple | |
559 | * manips not an issue. | |
560 | */ | |
878aed8d | 561 | if (maniptype == NF_NAT_MANIP_SRC && !random_port) { |
41a7cab6 | 562 | /* try the original tuple first */ |
fe2d0020 | 563 | if (in_range(orig_tuple, range)) { |
41a7cab6 CG |
564 | if (!nf_nat_used_tuple(orig_tuple, ct)) { |
565 | *tuple = *orig_tuple; | |
fe2d0020 | 566 | return; |
41a7cab6 | 567 | } |
fe2d0020 | 568 | } else if (find_appropriate_src(net, zone, |
c7232c99 | 569 | orig_tuple, tuple, range)) { |
0d53778e | 570 | pr_debug("get_unique_tuple: Found current src map\n"); |
0dbff689 | 571 | if (!nf_nat_used_tuple(tuple, ct)) |
fe2d0020 | 572 | return; |
5b1158e9 JK |
573 | } |
574 | } | |
575 | ||
c7232c99 | 576 | /* 2) Select the least-used IP/proto combination in the given range */ |
5b1158e9 | 577 | *tuple = *orig_tuple; |
5d0aa2cc | 578 | find_best_ips_proto(zone, tuple, range, ct, maniptype); |
5b1158e9 JK |
579 | |
580 | /* 3) The per-protocol part of the manip is made to map into | |
c7232c99 PM |
581 | * the range to make a unique tuple. |
582 | */ | |
5b1158e9 JK |
583 | |
584 | /* Only bother mapping if it's not already in range and unique */ | |
878aed8d | 585 | if (!random_port) { |
cbc9f2f4 | 586 | if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) { |
2eb0f624 | 587 | if (!(range->flags & NF_NAT_RANGE_PROTO_OFFSET) && |
fe2d0020 | 588 | l4proto_in_range(tuple, maniptype, |
2eb0f624 TDT |
589 | &range->min_proto, |
590 | &range->max_proto) && | |
c7232c99 | 591 | (range->min_proto.all == range->max_proto.all || |
99ad3c53 | 592 | !nf_nat_used_tuple(tuple, ct))) |
fe2d0020 | 593 | return; |
99ad3c53 | 594 | } else if (!nf_nat_used_tuple(tuple, ct)) { |
fe2d0020 | 595 | return; |
99ad3c53 CG |
596 | } |
597 | } | |
5b1158e9 | 598 | |
2eb0f624 | 599 | /* Last chance: get protocol to try to obtain unique tuple. */ |
203f2e78 | 600 | nf_nat_l4proto_unique_tuple(tuple, range, maniptype, ct); |
5b1158e9 JK |
601 | } |
602 | ||
f768e5bd FW |
603 | struct nf_conn_nat *nf_ct_nat_ext_add(struct nf_conn *ct) |
604 | { | |
605 | struct nf_conn_nat *nat = nfct_nat(ct); | |
606 | if (nat) | |
607 | return nat; | |
608 | ||
609 | if (!nf_ct_is_confirmed(ct)) | |
610 | nat = nf_ct_ext_add(ct, NF_CT_EXT_NAT, GFP_ATOMIC); | |
611 | ||
612 | return nat; | |
613 | } | |
614 | EXPORT_SYMBOL_GPL(nf_ct_nat_ext_add); | |
615 | ||
5b1158e9 JK |
616 | unsigned int |
617 | nf_nat_setup_info(struct nf_conn *ct, | |
2eb0f624 | 618 | const struct nf_nat_range2 *range, |
cc01dcbd | 619 | enum nf_nat_manip_type maniptype) |
5b1158e9 | 620 | { |
e1bf1687 | 621 | struct net *net = nf_ct_net(ct); |
5b1158e9 | 622 | struct nf_conntrack_tuple curr_tuple, new_tuple; |
2d59e5ca | 623 | |
d110a394 LZ |
624 | /* Can't setup nat info for confirmed ct. */ |
625 | if (nf_ct_is_confirmed(ct)) | |
626 | return NF_ACCEPT; | |
627 | ||
44d6e2f2 VR |
628 | WARN_ON(maniptype != NF_NAT_MANIP_SRC && |
629 | maniptype != NF_NAT_MANIP_DST); | |
75c26314 FW |
630 | |
631 | if (WARN_ON(nf_nat_initialized(ct, maniptype))) | |
632 | return NF_DROP; | |
5b1158e9 JK |
633 | |
634 | /* What we've got will look like inverse of reply. Normally | |
c7232c99 PM |
635 | * this is what is in the conntrack, except for prior |
636 | * manipulations (future optimization: if num_manips == 0, | |
637 | * orig_tp = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple) | |
638 | */ | |
303e0c55 FW |
639 | nf_ct_invert_tuple(&curr_tuple, |
640 | &ct->tuplehash[IP_CT_DIR_REPLY].tuple); | |
5b1158e9 JK |
641 | |
642 | get_unique_tuple(&new_tuple, &curr_tuple, range, ct, maniptype); | |
643 | ||
644 | if (!nf_ct_tuple_equal(&new_tuple, &curr_tuple)) { | |
645 | struct nf_conntrack_tuple reply; | |
646 | ||
647 | /* Alter conntrack table so will recognize replies. */ | |
303e0c55 | 648 | nf_ct_invert_tuple(&reply, &new_tuple); |
5b1158e9 JK |
649 | nf_conntrack_alter_reply(ct, &reply); |
650 | ||
651 | /* Non-atomic: we own this at the moment. */ | |
cbc9f2f4 | 652 | if (maniptype == NF_NAT_MANIP_SRC) |
5b1158e9 JK |
653 | ct->status |= IPS_SRC_NAT; |
654 | else | |
655 | ct->status |= IPS_DST_NAT; | |
41d73ec0 | 656 | |
ab6dd1be | 657 | if (nfct_help(ct) && !nfct_seqadj(ct)) |
4440a2ab GF |
658 | if (!nfct_seqadj_ext_add(ct)) |
659 | return NF_DROP; | |
5b1158e9 JK |
660 | } |
661 | ||
cbc9f2f4 | 662 | if (maniptype == NF_NAT_MANIP_SRC) { |
e1bf1687 | 663 | unsigned int srchash; |
8073e960 | 664 | spinlock_t *lock; |
e1bf1687 | 665 | |
d2966dc7 | 666 | srchash = hash_by_src(net, nf_ct_zone(ct), |
e1bf1687 | 667 | &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); |
b0ade851 | 668 | lock = &nf_nat_locks[srchash % CONNTRACK_LOCKS]; |
8073e960 | 669 | spin_lock_bh(lock); |
e1bf1687 FW |
670 | hlist_add_head_rcu(&ct->nat_bysource, |
671 | &nf_nat_bysource[srchash]); | |
8073e960 | 672 | spin_unlock_bh(lock); |
5b1158e9 JK |
673 | } |
674 | ||
675 | /* It's done. */ | |
cbc9f2f4 | 676 | if (maniptype == NF_NAT_MANIP_DST) |
a7c2f4d7 | 677 | ct->status |= IPS_DST_NAT_DONE; |
5b1158e9 | 678 | else |
a7c2f4d7 | 679 | ct->status |= IPS_SRC_NAT_DONE; |
5b1158e9 JK |
680 | |
681 | return NF_ACCEPT; | |
682 | } | |
683 | EXPORT_SYMBOL(nf_nat_setup_info); | |
684 | ||
0eba801b PNA |
685 | static unsigned int |
686 | __nf_nat_alloc_null_binding(struct nf_conn *ct, enum nf_nat_manip_type manip) | |
f59cb045 PNA |
687 | { |
688 | /* Force range to this IP; let proto decide mapping for | |
689 | * per-proto parts (hence not IP_NAT_RANGE_PROTO_SPECIFIED). | |
690 | * Use reply in case it's already been mangled (eg local packet). | |
691 | */ | |
692 | union nf_inet_addr ip = | |
0eba801b | 693 | (manip == NF_NAT_MANIP_SRC ? |
f59cb045 PNA |
694 | ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3 : |
695 | ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3); | |
2eb0f624 | 696 | struct nf_nat_range2 range = { |
f59cb045 PNA |
697 | .flags = NF_NAT_RANGE_MAP_IPS, |
698 | .min_addr = ip, | |
699 | .max_addr = ip, | |
700 | }; | |
0eba801b PNA |
701 | return nf_nat_setup_info(ct, &range, manip); |
702 | } | |
703 | ||
704 | unsigned int | |
705 | nf_nat_alloc_null_binding(struct nf_conn *ct, unsigned int hooknum) | |
706 | { | |
707 | return __nf_nat_alloc_null_binding(ct, HOOK2MANIP(hooknum)); | |
f59cb045 PNA |
708 | } |
709 | EXPORT_SYMBOL_GPL(nf_nat_alloc_null_binding); | |
710 | ||
5b1158e9 JK |
711 | /* Do packet manipulations according to nf_nat_setup_info. */ |
712 | unsigned int nf_nat_packet(struct nf_conn *ct, | |
713 | enum ip_conntrack_info ctinfo, | |
714 | unsigned int hooknum, | |
3db05fea | 715 | struct sk_buff *skb) |
5b1158e9 | 716 | { |
368982cd | 717 | enum nf_nat_manip_type mtype = HOOK2MANIP(hooknum); |
5b1158e9 | 718 | enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); |
368982cd | 719 | unsigned int verdict = NF_ACCEPT; |
5b1158e9 | 720 | unsigned long statusbit; |
5b1158e9 | 721 | |
cbc9f2f4 | 722 | if (mtype == NF_NAT_MANIP_SRC) |
5b1158e9 JK |
723 | statusbit = IPS_SRC_NAT; |
724 | else | |
725 | statusbit = IPS_DST_NAT; | |
726 | ||
727 | /* Invert if this is reply dir. */ | |
728 | if (dir == IP_CT_DIR_REPLY) | |
729 | statusbit ^= IPS_NAT_MASK; | |
730 | ||
731 | /* Non-atomic: these bits don't change. */ | |
368982cd PNA |
732 | if (ct->status & statusbit) |
733 | verdict = nf_nat_manip_pkt(skb, ct, mtype, dir); | |
5b1158e9 | 734 | |
368982cd | 735 | return verdict; |
5b1158e9 JK |
736 | } |
737 | EXPORT_SYMBOL_GPL(nf_nat_packet); | |
738 | ||
8e0538d8 FW |
739 | static bool in_vrf_postrouting(const struct nf_hook_state *state) |
740 | { | |
741 | #if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) | |
742 | if (state->hook == NF_INET_POST_ROUTING && | |
743 | netif_is_l3_master(state->out)) | |
744 | return true; | |
745 | #endif | |
746 | return false; | |
747 | } | |
748 | ||
1f55236b FW |
749 | unsigned int |
750 | nf_nat_inet_fn(void *priv, struct sk_buff *skb, | |
9971a514 | 751 | const struct nf_hook_state *state) |
1f55236b FW |
752 | { |
753 | struct nf_conn *ct; | |
754 | enum ip_conntrack_info ctinfo; | |
755 | struct nf_conn_nat *nat; | |
756 | /* maniptype == SRC for postrouting. */ | |
757 | enum nf_nat_manip_type maniptype = HOOK2MANIP(state->hook); | |
758 | ||
759 | ct = nf_ct_get(skb, &ctinfo); | |
760 | /* Can't track? It's not due to stress, or conntrack would | |
761 | * have dropped it. Hence it's the user's responsibilty to | |
762 | * packet filter it out, or implement conntrack/NAT for that | |
763 | * protocol. 8) --RR | |
764 | */ | |
8e0538d8 | 765 | if (!ct || in_vrf_postrouting(state)) |
1f55236b FW |
766 | return NF_ACCEPT; |
767 | ||
768 | nat = nfct_nat(ct); | |
769 | ||
770 | switch (ctinfo) { | |
771 | case IP_CT_RELATED: | |
772 | case IP_CT_RELATED_REPLY: | |
773 | /* Only ICMPs can be IP_CT_IS_REPLY. Fallthrough */ | |
774 | case IP_CT_NEW: | |
775 | /* Seen it before? This can happen for loopback, retrans, | |
776 | * or local packets. | |
777 | */ | |
778 | if (!nf_nat_initialized(ct, maniptype)) { | |
9971a514 FW |
779 | struct nf_nat_lookup_hook_priv *lpriv = priv; |
780 | struct nf_hook_entries *e = rcu_dereference(lpriv->entries); | |
1f55236b | 781 | unsigned int ret; |
9971a514 FW |
782 | int i; |
783 | ||
784 | if (!e) | |
785 | goto null_bind; | |
786 | ||
787 | for (i = 0; i < e->num_hook_entries; i++) { | |
788 | ret = e->hooks[i].hook(e->hooks[i].priv, skb, | |
789 | state); | |
790 | if (ret != NF_ACCEPT) | |
791 | return ret; | |
792 | if (nf_nat_initialized(ct, maniptype)) | |
793 | goto do_nat; | |
794 | } | |
795 | null_bind: | |
1f55236b FW |
796 | ret = nf_nat_alloc_null_binding(ct, state->hook); |
797 | if (ret != NF_ACCEPT) | |
798 | return ret; | |
799 | } else { | |
800 | pr_debug("Already setup manip %s for ct %p (status bits 0x%lx)\n", | |
801 | maniptype == NF_NAT_MANIP_SRC ? "SRC" : "DST", | |
802 | ct, ct->status); | |
803 | if (nf_nat_oif_changed(state->hook, ctinfo, nat, | |
804 | state->out)) | |
805 | goto oif_changed; | |
806 | } | |
807 | break; | |
808 | default: | |
809 | /* ESTABLISHED */ | |
810 | WARN_ON(ctinfo != IP_CT_ESTABLISHED && | |
811 | ctinfo != IP_CT_ESTABLISHED_REPLY); | |
812 | if (nf_nat_oif_changed(state->hook, ctinfo, nat, state->out)) | |
813 | goto oif_changed; | |
814 | } | |
9971a514 | 815 | do_nat: |
1f55236b FW |
816 | return nf_nat_packet(ct, ctinfo, state->hook, skb); |
817 | ||
818 | oif_changed: | |
819 | nf_ct_kill_acct(ct, ctinfo, skb); | |
820 | return NF_DROP; | |
821 | } | |
822 | EXPORT_SYMBOL_GPL(nf_nat_inet_fn); | |
823 | ||
c7232c99 PM |
824 | struct nf_nat_proto_clean { |
825 | u8 l3proto; | |
826 | u8 l4proto; | |
c7232c99 PM |
827 | }; |
828 | ||
c2d421e1 FW |
829 | /* kill conntracks with affected NAT section */ |
830 | static int nf_nat_proto_remove(struct nf_conn *i, void *data) | |
5b1158e9 | 831 | { |
c7232c99 | 832 | const struct nf_nat_proto_clean *clean = data; |
c2d421e1 | 833 | |
c7232c99 PM |
834 | if ((clean->l3proto && nf_ct_l3num(i) != clean->l3proto) || |
835 | (clean->l4proto && nf_ct_protonum(i) != clean->l4proto)) | |
5b1158e9 JK |
836 | return 0; |
837 | ||
c2d421e1 | 838 | return i->status & IPS_NAT_MASK ? 1 : 0; |
c7232c99 | 839 | } |
5b1158e9 | 840 | |
8073e960 FW |
841 | static void __nf_nat_cleanup_conntrack(struct nf_conn *ct) |
842 | { | |
843 | unsigned int h; | |
844 | ||
d2966dc7 | 845 | h = hash_by_src(nf_ct_net(ct), nf_ct_zone(ct), &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); |
b0ade851 | 846 | spin_lock_bh(&nf_nat_locks[h % CONNTRACK_LOCKS]); |
8073e960 | 847 | hlist_del_rcu(&ct->nat_bysource); |
b0ade851 | 848 | spin_unlock_bh(&nf_nat_locks[h % CONNTRACK_LOCKS]); |
8073e960 FW |
849 | } |
850 | ||
945b2b2d FW |
851 | static int nf_nat_proto_clean(struct nf_conn *ct, void *data) |
852 | { | |
945b2b2d FW |
853 | if (nf_nat_proto_remove(ct, data)) |
854 | return 1; | |
855 | ||
2420770b | 856 | /* This module is being removed and conntrack has nat null binding. |
945b2b2d FW |
857 | * Remove it from bysource hash, as the table will be freed soon. |
858 | * | |
859 | * Else, when the conntrack is destoyed, nf_nat_cleanup_conntrack() | |
860 | * will delete entry from already-freed table. | |
861 | */ | |
2420770b FW |
862 | if (test_and_clear_bit(IPS_SRC_NAT_DONE_BIT, &ct->status)) |
863 | __nf_nat_cleanup_conntrack(ct); | |
945b2b2d | 864 | |
945b2b2d FW |
865 | /* don't delete conntrack. Although that would make things a lot |
866 | * simpler, we'd end up flushing all conntracks on nat rmmod. | |
867 | */ | |
868 | return 0; | |
869 | } | |
870 | ||
25985edc | 871 | /* No one using conntrack by the time this called. */ |
d8a0509a YK |
872 | static void nf_nat_cleanup_conntrack(struct nf_conn *ct) |
873 | { | |
8073e960 FW |
874 | if (ct->status & IPS_SRC_NAT_DONE) |
875 | __nf_nat_cleanup_conntrack(ct); | |
2d59e5ca YK |
876 | } |
877 | ||
61eb3107 | 878 | static struct nf_ct_ext_type nat_extend __read_mostly = { |
d8a0509a YK |
879 | .len = sizeof(struct nf_conn_nat), |
880 | .align = __alignof__(struct nf_conn_nat), | |
881 | .destroy = nf_nat_cleanup_conntrack, | |
d8a0509a | 882 | .id = NF_CT_EXT_NAT, |
2d59e5ca YK |
883 | }; |
884 | ||
24de3d37 | 885 | #if IS_ENABLED(CONFIG_NF_CT_NETLINK) |
e6a7d3c0 PNA |
886 | |
887 | #include <linux/netfilter/nfnetlink.h> | |
888 | #include <linux/netfilter/nfnetlink_conntrack.h> | |
889 | ||
890 | static const struct nla_policy protonat_nla_policy[CTA_PROTONAT_MAX+1] = { | |
891 | [CTA_PROTONAT_PORT_MIN] = { .type = NLA_U16 }, | |
892 | [CTA_PROTONAT_PORT_MAX] = { .type = NLA_U16 }, | |
893 | }; | |
894 | ||
76b90019 FW |
895 | static int nf_nat_l4proto_nlattr_to_range(struct nlattr *tb[], |
896 | struct nf_nat_range2 *range) | |
897 | { | |
898 | if (tb[CTA_PROTONAT_PORT_MIN]) { | |
899 | range->min_proto.all = nla_get_be16(tb[CTA_PROTONAT_PORT_MIN]); | |
900 | range->max_proto.all = range->min_proto.all; | |
901 | range->flags |= NF_NAT_RANGE_PROTO_SPECIFIED; | |
902 | } | |
903 | if (tb[CTA_PROTONAT_PORT_MAX]) { | |
904 | range->max_proto.all = nla_get_be16(tb[CTA_PROTONAT_PORT_MAX]); | |
905 | range->flags |= NF_NAT_RANGE_PROTO_SPECIFIED; | |
906 | } | |
907 | return 0; | |
908 | } | |
909 | ||
e6a7d3c0 PNA |
910 | static int nfnetlink_parse_nat_proto(struct nlattr *attr, |
911 | const struct nf_conn *ct, | |
2eb0f624 | 912 | struct nf_nat_range2 *range) |
e6a7d3c0 PNA |
913 | { |
914 | struct nlattr *tb[CTA_PROTONAT_MAX+1]; | |
e6a7d3c0 PNA |
915 | int err; |
916 | ||
8cb08174 JB |
917 | err = nla_parse_nested_deprecated(tb, CTA_PROTONAT_MAX, attr, |
918 | protonat_nla_policy, NULL); | |
e6a7d3c0 PNA |
919 | if (err < 0) |
920 | return err; | |
921 | ||
76b90019 | 922 | return nf_nat_l4proto_nlattr_to_range(tb, range); |
e6a7d3c0 PNA |
923 | } |
924 | ||
925 | static const struct nla_policy nat_nla_policy[CTA_NAT_MAX+1] = { | |
c7232c99 PM |
926 | [CTA_NAT_V4_MINIP] = { .type = NLA_U32 }, |
927 | [CTA_NAT_V4_MAXIP] = { .type = NLA_U32 }, | |
58a317f1 PM |
928 | [CTA_NAT_V6_MINIP] = { .len = sizeof(struct in6_addr) }, |
929 | [CTA_NAT_V6_MAXIP] = { .len = sizeof(struct in6_addr) }, | |
329fb58a | 930 | [CTA_NAT_PROTO] = { .type = NLA_NESTED }, |
e6a7d3c0 PNA |
931 | }; |
932 | ||
096d0906 FW |
933 | static int nf_nat_ipv4_nlattr_to_range(struct nlattr *tb[], |
934 | struct nf_nat_range2 *range) | |
935 | { | |
936 | if (tb[CTA_NAT_V4_MINIP]) { | |
937 | range->min_addr.ip = nla_get_be32(tb[CTA_NAT_V4_MINIP]); | |
938 | range->flags |= NF_NAT_RANGE_MAP_IPS; | |
939 | } | |
940 | ||
941 | if (tb[CTA_NAT_V4_MAXIP]) | |
942 | range->max_addr.ip = nla_get_be32(tb[CTA_NAT_V4_MAXIP]); | |
943 | else | |
944 | range->max_addr.ip = range->min_addr.ip; | |
945 | ||
946 | return 0; | |
947 | } | |
948 | ||
949 | static int nf_nat_ipv6_nlattr_to_range(struct nlattr *tb[], | |
950 | struct nf_nat_range2 *range) | |
951 | { | |
952 | if (tb[CTA_NAT_V6_MINIP]) { | |
953 | nla_memcpy(&range->min_addr.ip6, tb[CTA_NAT_V6_MINIP], | |
954 | sizeof(struct in6_addr)); | |
955 | range->flags |= NF_NAT_RANGE_MAP_IPS; | |
956 | } | |
957 | ||
958 | if (tb[CTA_NAT_V6_MAXIP]) | |
959 | nla_memcpy(&range->max_addr.ip6, tb[CTA_NAT_V6_MAXIP], | |
960 | sizeof(struct in6_addr)); | |
961 | else | |
962 | range->max_addr = range->min_addr; | |
963 | ||
964 | return 0; | |
965 | } | |
966 | ||
e6a7d3c0 | 967 | static int |
39938324 | 968 | nfnetlink_parse_nat(const struct nlattr *nat, |
096d0906 | 969 | const struct nf_conn *ct, struct nf_nat_range2 *range) |
e6a7d3c0 PNA |
970 | { |
971 | struct nlattr *tb[CTA_NAT_MAX+1]; | |
972 | int err; | |
973 | ||
974 | memset(range, 0, sizeof(*range)); | |
975 | ||
8cb08174 JB |
976 | err = nla_parse_nested_deprecated(tb, CTA_NAT_MAX, nat, |
977 | nat_nla_policy, NULL); | |
e6a7d3c0 PNA |
978 | if (err < 0) |
979 | return err; | |
980 | ||
096d0906 FW |
981 | switch (nf_ct_l3num(ct)) { |
982 | case NFPROTO_IPV4: | |
983 | err = nf_nat_ipv4_nlattr_to_range(tb, range); | |
984 | break; | |
985 | case NFPROTO_IPV6: | |
986 | err = nf_nat_ipv6_nlattr_to_range(tb, range); | |
987 | break; | |
988 | default: | |
989 | err = -EPROTONOSUPPORT; | |
990 | break; | |
991 | } | |
992 | ||
993 | if (err) | |
0eba801b | 994 | return err; |
e6a7d3c0 PNA |
995 | |
996 | if (!tb[CTA_NAT_PROTO]) | |
0eba801b | 997 | return 0; |
e6a7d3c0 | 998 | |
0eba801b | 999 | return nfnetlink_parse_nat_proto(tb[CTA_NAT_PROTO], ct, range); |
e6a7d3c0 PNA |
1000 | } |
1001 | ||
0eba801b | 1002 | /* This function is called under rcu_read_lock() */ |
e6a7d3c0 PNA |
1003 | static int |
1004 | nfnetlink_parse_nat_setup(struct nf_conn *ct, | |
1005 | enum nf_nat_manip_type manip, | |
39938324 | 1006 | const struct nlattr *attr) |
e6a7d3c0 | 1007 | { |
2eb0f624 | 1008 | struct nf_nat_range2 range; |
c7232c99 | 1009 | int err; |
e6a7d3c0 | 1010 | |
0eba801b PNA |
1011 | /* Should not happen, restricted to creating new conntracks |
1012 | * via ctnetlink. | |
1013 | */ | |
1014 | if (WARN_ON_ONCE(nf_nat_initialized(ct, manip))) | |
1015 | return -EEXIST; | |
1016 | ||
0eba801b PNA |
1017 | /* No NAT information has been passed, allocate the null-binding */ |
1018 | if (attr == NULL) | |
7025bac4 | 1019 | return __nf_nat_alloc_null_binding(ct, manip) == NF_DROP ? -ENOMEM : 0; |
0eba801b | 1020 | |
096d0906 | 1021 | err = nfnetlink_parse_nat(attr, ct, &range); |
c7232c99 PM |
1022 | if (err < 0) |
1023 | return err; | |
e6a7d3c0 | 1024 | |
ecfcdfec | 1025 | return nf_nat_setup_info(ct, &range, manip) == NF_DROP ? -ENOMEM : 0; |
e6a7d3c0 PNA |
1026 | } |
1027 | #else | |
1028 | static int | |
1029 | nfnetlink_parse_nat_setup(struct nf_conn *ct, | |
1030 | enum nf_nat_manip_type manip, | |
39938324 | 1031 | const struct nlattr *attr) |
e6a7d3c0 PNA |
1032 | { |
1033 | return -EOPNOTSUPP; | |
1034 | } | |
1035 | #endif | |
1036 | ||
544d5c7d PNA |
1037 | static struct nf_ct_helper_expectfn follow_master_nat = { |
1038 | .name = "nat-follow-master", | |
1039 | .expectfn = nf_nat_follow_master, | |
1040 | }; | |
1041 | ||
d164385e | 1042 | int nf_nat_register_fn(struct net *net, u8 pf, const struct nf_hook_ops *ops, |
1cd472bf FW |
1043 | const struct nf_hook_ops *orig_nat_ops, unsigned int ops_count) |
1044 | { | |
1045 | struct nat_net *nat_net = net_generic(net, nat_net_id); | |
1046 | struct nf_nat_hooks_net *nat_proto_net; | |
1047 | struct nf_nat_lookup_hook_priv *priv; | |
1048 | unsigned int hooknum = ops->hooknum; | |
1049 | struct nf_hook_ops *nat_ops; | |
1050 | int i, ret; | |
1051 | ||
d164385e | 1052 | if (WARN_ON_ONCE(pf >= ARRAY_SIZE(nat_net->nat_proto_net))) |
1cd472bf FW |
1053 | return -EINVAL; |
1054 | ||
d164385e | 1055 | nat_proto_net = &nat_net->nat_proto_net[pf]; |
1cd472bf FW |
1056 | |
1057 | for (i = 0; i < ops_count; i++) { | |
1cd472bf FW |
1058 | if (orig_nat_ops[i].hooknum == hooknum) { |
1059 | hooknum = i; | |
1060 | break; | |
1061 | } | |
1062 | } | |
1063 | ||
1064 | if (WARN_ON_ONCE(i == ops_count)) | |
1065 | return -EINVAL; | |
1066 | ||
1067 | mutex_lock(&nf_nat_proto_mutex); | |
1068 | if (!nat_proto_net->nat_hook_ops) { | |
1069 | WARN_ON(nat_proto_net->users != 0); | |
1070 | ||
1071 | nat_ops = kmemdup(orig_nat_ops, sizeof(*orig_nat_ops) * ops_count, GFP_KERNEL); | |
1072 | if (!nat_ops) { | |
1073 | mutex_unlock(&nf_nat_proto_mutex); | |
1074 | return -ENOMEM; | |
1075 | } | |
1076 | ||
1077 | for (i = 0; i < ops_count; i++) { | |
1078 | priv = kzalloc(sizeof(*priv), GFP_KERNEL); | |
1079 | if (priv) { | |
1080 | nat_ops[i].priv = priv; | |
1081 | continue; | |
1082 | } | |
1083 | mutex_unlock(&nf_nat_proto_mutex); | |
1084 | while (i) | |
1085 | kfree(nat_ops[--i].priv); | |
1086 | kfree(nat_ops); | |
1087 | return -ENOMEM; | |
1088 | } | |
1089 | ||
1090 | ret = nf_register_net_hooks(net, nat_ops, ops_count); | |
1091 | if (ret < 0) { | |
1092 | mutex_unlock(&nf_nat_proto_mutex); | |
1093 | for (i = 0; i < ops_count; i++) | |
1094 | kfree(nat_ops[i].priv); | |
1095 | kfree(nat_ops); | |
1096 | return ret; | |
1097 | } | |
1098 | ||
1099 | nat_proto_net->nat_hook_ops = nat_ops; | |
1100 | } | |
1101 | ||
1102 | nat_ops = nat_proto_net->nat_hook_ops; | |
1103 | priv = nat_ops[hooknum].priv; | |
1104 | if (WARN_ON_ONCE(!priv)) { | |
1105 | mutex_unlock(&nf_nat_proto_mutex); | |
1106 | return -EOPNOTSUPP; | |
1107 | } | |
1108 | ||
1109 | ret = nf_hook_entries_insert_raw(&priv->entries, ops); | |
1110 | if (ret == 0) | |
1111 | nat_proto_net->users++; | |
1112 | ||
1113 | mutex_unlock(&nf_nat_proto_mutex); | |
1114 | return ret; | |
1115 | } | |
1cd472bf | 1116 | |
d164385e FW |
1117 | void nf_nat_unregister_fn(struct net *net, u8 pf, const struct nf_hook_ops *ops, |
1118 | unsigned int ops_count) | |
1cd472bf FW |
1119 | { |
1120 | struct nat_net *nat_net = net_generic(net, nat_net_id); | |
1121 | struct nf_nat_hooks_net *nat_proto_net; | |
1122 | struct nf_nat_lookup_hook_priv *priv; | |
1123 | struct nf_hook_ops *nat_ops; | |
1124 | int hooknum = ops->hooknum; | |
1125 | int i; | |
1126 | ||
d164385e | 1127 | if (pf >= ARRAY_SIZE(nat_net->nat_proto_net)) |
1cd472bf FW |
1128 | return; |
1129 | ||
d164385e | 1130 | nat_proto_net = &nat_net->nat_proto_net[pf]; |
1cd472bf FW |
1131 | |
1132 | mutex_lock(&nf_nat_proto_mutex); | |
1133 | if (WARN_ON(nat_proto_net->users == 0)) | |
1134 | goto unlock; | |
1135 | ||
1136 | nat_proto_net->users--; | |
1137 | ||
1138 | nat_ops = nat_proto_net->nat_hook_ops; | |
1139 | for (i = 0; i < ops_count; i++) { | |
1140 | if (nat_ops[i].hooknum == hooknum) { | |
1141 | hooknum = i; | |
1142 | break; | |
1143 | } | |
1144 | } | |
1145 | if (WARN_ON_ONCE(i == ops_count)) | |
1146 | goto unlock; | |
1147 | priv = nat_ops[hooknum].priv; | |
1148 | nf_hook_entries_delete_raw(&priv->entries, ops); | |
1149 | ||
1150 | if (nat_proto_net->users == 0) { | |
1151 | nf_unregister_net_hooks(net, nat_ops, ops_count); | |
1152 | ||
1153 | for (i = 0; i < ops_count; i++) { | |
1154 | priv = nat_ops[i].priv; | |
1155 | kfree_rcu(priv, rcu_head); | |
1156 | } | |
1157 | ||
1158 | nat_proto_net->nat_hook_ops = NULL; | |
1159 | kfree(nat_ops); | |
1160 | } | |
1161 | unlock: | |
1162 | mutex_unlock(&nf_nat_proto_mutex); | |
1163 | } | |
1cd472bf FW |
1164 | |
1165 | static struct pernet_operations nat_net_ops = { | |
1166 | .id = &nat_net_id, | |
1167 | .size = sizeof(struct nat_net), | |
1168 | }; | |
1169 | ||
285c8a7a | 1170 | static const struct nf_nat_hook nat_hook = { |
2c205dd3 PNA |
1171 | .parse_nat_setup = nfnetlink_parse_nat_setup, |
1172 | #ifdef CONFIG_XFRM | |
1173 | .decode_session = __nf_nat_decode_session, | |
1174 | #endif | |
368982cd | 1175 | .manip_pkt = nf_nat_manip_pkt, |
2c205dd3 PNA |
1176 | }; |
1177 | ||
5b1158e9 JK |
1178 | static int __init nf_nat_init(void) |
1179 | { | |
8073e960 | 1180 | int ret, i; |
2d59e5ca | 1181 | |
e1bf1687 FW |
1182 | /* Leave them the same for the moment. */ |
1183 | nf_nat_htable_size = nf_conntrack_htable_size; | |
b0ade851 GU |
1184 | if (nf_nat_htable_size < CONNTRACK_LOCKS) |
1185 | nf_nat_htable_size = CONNTRACK_LOCKS; | |
e1bf1687 FW |
1186 | |
1187 | nf_nat_bysource = nf_ct_alloc_hashtable(&nf_nat_htable_size, 0); | |
1188 | if (!nf_nat_bysource) | |
1189 | return -ENOMEM; | |
a76ae1c8 | 1190 | |
2d59e5ca YK |
1191 | ret = nf_ct_extend_register(&nat_extend); |
1192 | if (ret < 0) { | |
285189c7 | 1193 | kvfree(nf_nat_bysource); |
5191d70f | 1194 | pr_err("Unable to register extension\n"); |
2d59e5ca YK |
1195 | return ret; |
1196 | } | |
5b1158e9 | 1197 | |
b0ade851 | 1198 | for (i = 0; i < CONNTRACK_LOCKS; i++) |
8073e960 FW |
1199 | spin_lock_init(&nf_nat_locks[i]); |
1200 | ||
1cd472bf FW |
1201 | ret = register_pernet_subsys(&nat_net_ops); |
1202 | if (ret < 0) { | |
1203 | nf_ct_extend_unregister(&nat_extend); | |
869f4fda | 1204 | kvfree(nf_nat_bysource); |
1cd472bf FW |
1205 | return ret; |
1206 | } | |
1207 | ||
c7232c99 | 1208 | nf_ct_helper_expectfn_register(&follow_master_nat); |
5b1158e9 | 1209 | |
2c205dd3 PNA |
1210 | WARN_ON(nf_nat_hook != NULL); |
1211 | RCU_INIT_POINTER(nf_nat_hook, &nat_hook); | |
1212 | ||
5b1158e9 JK |
1213 | return 0; |
1214 | } | |
1215 | ||
5b1158e9 JK |
1216 | static void __exit nf_nat_cleanup(void) |
1217 | { | |
8f23f35f | 1218 | struct nf_nat_proto_clean clean = {}; |
c7232c99 | 1219 | |
8f23f35f FW |
1220 | nf_ct_iterate_destroy(nf_nat_proto_clean, &clean); |
1221 | ||
2d59e5ca | 1222 | nf_ct_extend_unregister(&nat_extend); |
544d5c7d | 1223 | nf_ct_helper_expectfn_unregister(&follow_master_nat); |
2c205dd3 PNA |
1224 | RCU_INIT_POINTER(nf_nat_hook, NULL); |
1225 | ||
e1bf1687 | 1226 | synchronize_net(); |
285189c7 | 1227 | kvfree(nf_nat_bysource); |
1cd472bf | 1228 | unregister_pernet_subsys(&nat_net_ops); |
5b1158e9 JK |
1229 | } |
1230 | ||
1231 | MODULE_LICENSE("GPL"); | |
1232 | ||
1233 | module_init(nf_nat_init); | |
1234 | module_exit(nf_nat_cleanup); |