使用策略路由实现涉及getsockopt(…, SO_ORIGINAL_DST, …)程序的远程调用（Redsocks, ss-redir之类的远程调用）

Redsocks/ss-redir的实现

DNAT与REDIRECT模块在iptables target extension之列。

这两个模块都能根据参数，修改数据包的header中的目的IP和端口。

既然修改，那就不是附加，毕竟网络层传输层header的内容怎么可能随随便便增删呢，这意味着，数据包的真实目的地会被完完全全地抹掉！

Redsocks/ss-redir收到这样的数据包后，不做特殊处理，何以知道这数据包原来想发给谁呢？

这些程序之所以能正常工作，得益于NAT的透明性，被NAT处理的数据包，在内核中均有所记录，NAT后的正常通讯由NAT发起者维持，Redsocks/ss-redir这类程序便是通过内核中的记录得知真实目的地的。

这个原始目的地址可以通过以下系统调用获取：

       #include <sys/types.h>
       #include <sys/socket.h>

       int getsockopt(int sockfd, int level, int optname,
                      void *optval, socklen_t *optlen);

#include <sys/types.h>

#include <sys/socket.h>

int getsockopt(int sockfd, int level, int optname,

void *optval, socklen_t *optlen);

optname需要传值SO_ORIGINAL_DST。

Redsocks的调用见base.c的223行：

/* redsocks - transparent TCP-to-proxy redirector
 * Copyright (C) 2007-2011 Leonid Evdokimov <leon@darkk.net.ru>
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License.  You may obtain a copy
 * of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
 * License for the specific language governing permissions and limitations
 * under the License.
 */

#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <arpa/inet.h>
#include <sys/stat.h>
#include <sys/resource.h>
#include <fcntl.h>
#include <unistd.h>
#include <string.h>
#include <pwd.h>
#include <grp.h>
#include <stdlib.h>
#include "config.h"
#if defined USE_IPTABLES
# include <limits.h>
# include <linux/netfilter_ipv4.h>
#endif
#if defined USE_PF
# include <net/if.h>
# include <net/pfvar.h>
# include <sys/ioctl.h>
# include <errno.h>
#endif
#include "log.h"
#include "main.h"
#include "parser.h"
#include "redsocks.h"

/*
 * One redirector backend, i.e. one way of recovering the original
 * destination of a connection that was redirected to us.
 */
typedef struct redirector_subsys_t {
	/* optional setup hook; base_init() treats a negative return as failure */
	int (*init)();
	/* optional teardown hook, invoked from base_fini() */
	void (*fini)();
	/* stores the looked-up destination in *destaddr; the implementations in
	 * this file return 0 on success and -1 on failure */
	int (*getdestaddr)(int fd, const struct sockaddr_in *client, const struct sockaddr_in *bindaddr, struct sockaddr_in *destaddr);
	/* compared against the `redirector` config option in base_onexit() */
	const char *name;
	// some subsystems may store data here:
	// (the ipf/pf backends keep an open file descriptor in it; it is reset
	// to -1 when the backend is selected and when that descriptor is closed)
	int private;
} redirector_subsys;

/*
 * Settings of the single `base` config section. Every field except
 * `configured` and `redirector` is filled in by the parser through the
 * base_entries table.
 */
typedef struct base_instance_t {
	/* set to 1 by base_onenter(); used to reject a second `base` section */
	int configured;
	/* directory handed to chroot() in base_init(), NULL when unset */
	char *chroot;
	/* user name resolved with getpwnam() before setuid() */
	char *user;
	/* group name resolved with getgrnam() before setgid() */
	char *group;
	/* value of the `redirector` option, matched against backend names */
	char *redirector_name;
	/* backend selected in base_onexit() */
	redirector_subsys *redirector;
	/* log target passed to log_preopen(); when NULL base_init() falls back
	 * to "syslog:daemon" (daemon mode) or "stderr" */
	char *log_name;
	bool log_debug;
	bool log_info;
	/* when true base_init() forks, calls setsid() and redirects stdio to /dev/null */
	bool daemon;
#if defined(TCP_KEEPIDLE) && defined(TCP_KEEPCNT) && defined(TCP_KEEPINTVL)
	/* values for TCP_KEEPIDLE / TCP_KEEPCNT / TCP_KEEPINTVL; 0 leaves the
	 * corresponding socket option untouched (see apply_tcp_keepalive) */
	uint16_t tcp_keepalive_time;
	uint16_t tcp_keepalive_probes;
	uint16_t tcp_keepalive_intvl;
#endif
	/* RLIMIT_NOFILE to set; when 0 the current soft limit is read instead */
	uint32_t rlimit_nofile;
	/* when 0 base_init() derives a value from rlimit_nofile */
	uint32_t redsocks_conn_max;
	/* defaults to 7440 in base_onenter(); unit not visible here -- TODO confirm */
	uint32_t connpres_idle_timeout;
	/* defaults to 60000 in base_onenter(); base_onexit() rejects 0 */
	uint32_t max_accept_backoff_ms;
} base_instance;

/* the one and only `base` configuration, zeroed by base_onenter()/base_fini() */
static base_instance instance;

#if defined __FreeBSD__ || defined USE_PF
/*
 * Open `fname` with `flags` and park the resulting descriptor in the
 * selected redirector's `private` slot.
 *
 * Returns 0 on success; on failure logs the open() error and returns -1
 * without touching the slot.
 */
static int redir_open_private(const char *fname, int flags)
{
	const int opened = open(fname, flags);

	if (opened >= 0) {
		instance.redirector->private = opened;
		return 0;
	}

	log_errno(LOG_ERR, "open(%s)", fname);
	return -1;
}

static void redir_close_private()
{
	close(instance.redirector->private);
	instance.redirector->private = -1;
}
#endif

#ifdef __FreeBSD__
/*
 * Backend init hook for "ipf": open the IP-Filter device read-only.
 * IPNAT_NAME is used when the headers provide it, IPL_NAME otherwise.
 * Returns whatever redir_open_private() returns.
 */
static int redir_init_ipf()
{
#ifdef IPNAT_NAME
	return redir_open_private(IPNAT_NAME, O_RDONLY);
#else
	return redir_open_private(IPL_NAME, O_RDONLY);
#endif
}

/*
 * "ipf" backend: look up the pre-NAT destination with the SIOCGNATL ioctl
 * on the descriptor opened by redir_init_ipf().
 *
 * `fd` is not used. The lookup is keyed by the address we accepted on
 * (`bindaddr`) and the peer address (`client`). On success *destaddr gets
 * family, port and address from the lookup result and 0 is returned;
 * otherwise -1 is returned.
 */
static int getdestaddr_ipf(int fd, const struct sockaddr_in *client, const struct sockaddr_in *bindaddr, struct sockaddr_in *destaddr)
{
	int natfd = instance.redirector->private;
	struct natlookup natLookup;
	int x;
#if defined(IPFILTER_VERSION) && (IPFILTER_VERSION >= 4000027)
	struct ipfobj obj;
#else
	/* low byte of the ioctl command, used below to tell the two ABIs apart */
	static int siocgnatl_cmd = SIOCGNATL & 0xff;
#endif

#if defined(IPFILTER_VERSION) && (IPFILTER_VERSION >= 4000027)
	/* newer IP-Filter: the ioctl takes an ipfobj wrapper pointing at natLookup */
	obj.ipfo_rev = IPFILTER_VERSION;
	obj.ipfo_size = sizeof(natLookup);
	obj.ipfo_ptr = &natLookup;
	obj.ipfo_type = IPFOBJ_NATLOOKUP;
	obj.ipfo_offset = 0;
#endif

	/* ports and addresses are copied as found in the sockaddr_in structures */
	natLookup.nl_inport = bindaddr->sin_port;
	natLookup.nl_outport = client->sin_port;
	natLookup.nl_inip = bindaddr->sin_addr;
	natLookup.nl_outip = client->sin_addr;
	natLookup.nl_flags = IPN_TCP;
#if defined(IPFILTER_VERSION) && (IPFILTER_VERSION >= 4000027)
	x = ioctl(natfd, SIOCGNATL, &obj);
#else
	/*
	 * IP-Filter changed the type for SIOCGNATL between
	 * 3.3 and 3.4.  It also changed the cmd value for
	 * SIOCGNATL, so at least we can detect it.  We could
	 * put something in configure and use ifdefs here, but
	 * this seems simpler.
	 */
	if (63 == siocgnatl_cmd) {
		/* this variant expects a pointer to a pointer to the lookup struct */
		struct natlookup *nlp = &natLookup;
		x = ioctl(natfd, SIOCGNATL, &nlp);
	} else {
		x = ioctl(natfd, SIOCGNATL, &natLookup);
	}
#endif
	if (x < 0) {
		/* ESRCH is not logged; any other error is reported as a warning */
		if (errno != ESRCH)
			log_errno(LOG_WARNING, "ioctl(SIOCGNATL)\n");
		return -1;
	} else {
		destaddr->sin_family = AF_INET;
		destaddr->sin_port = natLookup.nl_realport;
		destaddr->sin_addr = natLookup.nl_realip;
		return 0;
	}
}
#endif

#ifdef USE_PF
/*
 * Backend init hook for "pf": open the pf control device read-write.
 * Returns whatever redir_open_private() returns.
 */
static int redir_init_pf()
{
	const char *pf_device = "/dev/pf";

	return redir_open_private(pf_device, O_RDWR);
}

/*
 * "pf" backend: query the pf state table (DIOCNATLOOK) for the original
 * destination of the TCP connection client -> bindaddr.
 *
 * The lookup is tried with direction PF_OUT first and, only when that fails
 * with ENOENT, once more with PF_IN. `fd` is not used.
 *
 * Returns 0 and fills *destaddr on success; logs a warning describing the
 * looked-up endpoints and returns -1 on failure.
 */
static int getdestaddr_pf(
		int fd, const struct sockaddr_in *client, const struct sockaddr_in *bindaddr,
		struct sockaddr_in *destaddr)
{
	const int pffd = instance.redirector->private;
	struct pfioc_natlook nl;
	char clientaddr_str[INET6_ADDRSTRLEN], bindaddr_str[INET6_ADDRSTRLEN];
	int saved_errno;
	int rc;

	memset(&nl, 0, sizeof(nl));
	nl.af = AF_INET;
	nl.proto = IPPROTO_TCP;
	nl.direction = PF_OUT;
	nl.saddr.v4.s_addr = client->sin_addr.s_addr;
	nl.sport = client->sin_port;
	nl.daddr.v4.s_addr = bindaddr->sin_addr.s_addr;
	nl.dport = bindaddr->sin_port;

	rc = ioctl(pffd, DIOCNATLOOK, &nl);
	if (rc != 0 && errno == ENOENT) {
		nl.direction = PF_IN; // required to redirect local packets
		rc = ioctl(pffd, DIOCNATLOOK, &nl);
	}

	if (rc == 0) {
		destaddr->sin_family = AF_INET;
		destaddr->sin_port = nl.rdport;
		destaddr->sin_addr = nl.rdaddr.v4;
		return 0;
	}

	/* inet_ntop() may clobber errno, so keep the ioctl's value for the log */
	saved_errno = errno;
	if (!inet_ntop(client->sin_family, &client->sin_addr, clientaddr_str, sizeof(clientaddr_str)))
		strncpy(clientaddr_str, "???", sizeof(clientaddr_str));
	if (!inet_ntop(bindaddr->sin_family, &bindaddr->sin_addr, bindaddr_str, sizeof(bindaddr_str)))
		strncpy(bindaddr_str, "???", sizeof(bindaddr_str));
	errno = saved_errno;

	log_errno(LOG_WARNING, "ioctl(DIOCNATLOOK {src=%s:%d, dst=%s:%d})",
			  clientaddr_str, ntohs(nl.sport), bindaddr_str, ntohs(nl.dport));
	return -1;
}
#endif

#ifdef USE_IPTABLES
/*
 * "iptables" backend: read the pre-NAT destination of the accepted socket
 * `fd` with getsockopt(SOL_IP, SO_ORIGINAL_DST) into *destaddr.
 *
 * `client` and `bindaddr` are not used.
 * Returns 0 on success; logs a warning and returns -1 when getsockopt fails.
 */
static int getdestaddr_iptables(int fd, const struct sockaddr_in *client, const struct sockaddr_in *bindaddr, struct sockaddr_in *destaddr)
{
	socklen_t addrlen = sizeof(struct sockaddr_in);

	if (getsockopt(fd, SOL_IP, SO_ORIGINAL_DST, destaddr, &addrlen) == 0)
		return 0;

	log_errno(LOG_WARNING, "getsockopt");
	return -1;
}
#endif

/*
 * "generic" backend: report the local address of the accepted socket `fd`
 * (getsockname) as the destination.
 *
 * `client` and `bindaddr` are not used.
 * Returns 0 on success; logs a warning and returns -1 when getsockname fails.
 */
static int getdestaddr_generic(int fd, const struct sockaddr_in *client, const struct sockaddr_in *bindaddr, struct sockaddr_in *destaddr)
{
	socklen_t socklen = sizeof(*destaddr);
	int error;

	error = getsockname(fd, (struct sockaddr*)destaddr, &socklen);
	if (error) {
		/* name the call that actually failed (the message used to say "getsockopt") */
		log_errno(LOG_WARNING, "getsockname");
		return -1;
	}
	return 0;
}

int getdestaddr(int fd, const struct sockaddr_in *client, const struct sockaddr_in *bindaddr, struct sockaddr_in *destaddr)
{
	return instance.redirector->getdestaddr(fd, client, bindaddr, destaddr);
}

/*
 * Enable TCP keepalive on `fd` and apply the configured keepalive tunables.
 *
 * SO_KEEPALIVE is always switched on. The idle/count/interval options are
 * only applied when the platform defines all of TCP_KEEPIDLE, TCP_KEEPCNT
 * and TCP_KEEPINTVL -- the same condition under which the matching
 * `instance` fields exist -- and only for values that are non-zero.
 *
 * Returns 0 on success; on the first setsockopt() failure logs a warning
 * and returns -1.
 */
int apply_tcp_keepalive(int fd)
{
	struct { int level, option, value; } opt[] = {
		{ SOL_SOCKET, SO_KEEPALIVE, 1 },
#if defined(TCP_KEEPIDLE) && defined(TCP_KEEPCNT) && defined(TCP_KEEPINTVL)
		{ IPPROTO_TCP, TCP_KEEPIDLE, instance.tcp_keepalive_time },
		{ IPPROTO_TCP, TCP_KEEPCNT, instance.tcp_keepalive_probes },
		{ IPPROTO_TCP, TCP_KEEPINTVL, instance.tcp_keepalive_intvl },
#endif
	};
	for (int i = 0; i < SIZEOF_ARRAY(opt); ++i) {
		/* a zero value means "not configured": keep the system default */
		if (opt[i].value) {
			int error = setsockopt(fd, opt[i].level, opt[i].option, &opt[i].value, sizeof(opt[i].value));
			if (error) {
				log_errno(LOG_WARNING, "setsockopt(%d, %d, %d, &%d, %zu)", fd, opt[i].level, opt[i].option, opt[i].value, sizeof(opt[i].value));
				return -1;
			}
		}
	}
	return 0;
}

/* Accessor for the configured `max_accept_backoff` value (default 60000). */
uint32_t max_accept_backoff_ms()
{
	const uint32_t backoff_ms = instance.max_accept_backoff_ms;

	return backoff_ms;
}

/* Accessor for `redsocks_conn_max` (configured, or derived in base_init()). */
uint32_t redsocks_conn_max()
{
	const uint32_t conn_limit = instance.redsocks_conn_max;

	return conn_limit;
}

/* Accessor for the configured `connpres_idle_timeout` value (default 7440). */
uint32_t connpres_idle_timeout()
{
	const uint32_t idle_timeout = instance.connpres_idle_timeout;

	return idle_timeout;
}

/*
 * Backends compiled into this build. base_onexit() selects the entry whose
 * `name` equals the `redirector` config option; hooks that are left out
 * stay NULL and are skipped by base_init()/base_fini().
 */
static redirector_subsys redirector_subsystems[] =
{
#ifdef __FreeBSD__
	{
		.name        = "ipf",
		.init        = redir_init_ipf,
		.fini        = redir_close_private,
		.getdestaddr = getdestaddr_ipf,
	},
#endif
#ifdef USE_PF
	{
		.name        = "pf",
		.init        = redir_init_pf,
		.fini        = redir_close_private,
		.getdestaddr = getdestaddr_pf,
	},
#endif
#ifdef USE_IPTABLES
	{
		.name        = "iptables",
		.getdestaddr = getdestaddr_iptables,
	},
#endif
	{
		.name        = "generic",
		.getdestaddr = getdestaddr_generic,
	},
};

/***********************************************************************
 * `base` config parsing
 */
/*
 * Keys accepted inside the `base` section, each bound to the `instance`
 * field that receives the parsed value. The empty entry ends the table.
 */
static parser_entry base_entries[] =
{
	{ .key = "chroot",     .addr = &instance.chroot,          .type = pt_pchar },
	{ .key = "user",       .addr = &instance.user,            .type = pt_pchar },
	{ .key = "group",      .addr = &instance.group,           .type = pt_pchar },
	{ .key = "redirector", .addr = &instance.redirector_name, .type = pt_pchar },
	{ .key = "log",        .addr = &instance.log_name,        .type = pt_pchar },
	{ .key = "log_debug",  .addr = &instance.log_debug,       .type = pt_bool },
	{ .key = "log_info",   .addr = &instance.log_info,        .type = pt_bool },
	{ .key = "daemon",     .addr = &instance.daemon,          .type = pt_bool },
#if defined(TCP_KEEPIDLE) && defined(TCP_KEEPCNT) && defined(TCP_KEEPINTVL)
	{ .key = "tcp_keepalive_time",   .addr = &instance.tcp_keepalive_time,   .type = pt_uint16 },
	{ .key = "tcp_keepalive_probes", .addr = &instance.tcp_keepalive_probes, .type = pt_uint16 },
	{ .key = "tcp_keepalive_intvl",  .addr = &instance.tcp_keepalive_intvl,  .type = pt_uint16 },
#endif
	{ .key = "rlimit_nofile",         .addr = &instance.rlimit_nofile,         .type = pt_uint32 },
	{ .key = "redsocks_conn_max",     .addr = &instance.redsocks_conn_max,     .type = pt_uint32 },
	{ .key = "connpres_idle_timeout", .addr = &instance.connpres_idle_timeout, .type = pt_uint32 },
	{ .key = "max_accept_backoff",    .addr = &instance.max_accept_backoff_ms, .type = pt_uint32 },
	{ }
};

/*
 * Parser hook run when a `base` section is opened: reset `instance` to its
 * defaults. A second `base` section is reported as a parser error.
 *
 * Returns 0 on success, -1 when `base` was already configured.
 */
static int base_onenter(parser_section *section)
{
	if (!instance.configured) {
		memset(&instance, 0, sizeof(instance));
		instance.connpres_idle_timeout = 7440;
		instance.max_accept_backoff_ms = 60000;
		instance.configured = 1;
		return 0;
	}

	parser_error(section->context, "only one instance of base is valid");
	return -1;
}

/*
 * Parser hook run when the `base` section is closed: validate the parsed
 * values and bind `instance.redirector` to the backend named by the
 * `redirector` option (resetting that backend's private slot to -1).
 *
 * Returns 0 on success; reports a parser error and returns -1 when
 * `max_accept_backoff` is 0, `redirector` is missing, or it names no
 * compiled-in backend.
 */
static int base_onexit(parser_section *section)
{
	redirector_subsys *candidate;

	if (instance.max_accept_backoff_ms == 0) {
		parser_error(section->context, "`max_accept_backoff` must be positive, 0 ms is too low");
		return -1;
	}

	if (instance.redirector_name == NULL) {
		parser_error(section->context, "no `redirector` set");
		return -1;
	}

	FOREACH(candidate, redirector_subsystems) {
		if (strcmp(candidate->name, instance.redirector_name) == 0) {
			candidate->private = -1;
			instance.redirector = candidate;
			break;
		}
	}

	if (instance.redirector == NULL) {
		parser_error(section->context, "invalid `redirector` set <%s>", instance.redirector_name);
		return -1;
	}

	return 0;
}

/* Parser description of the `base` section: its keys and enter/exit hooks. */
static parser_section base_conf_section =
{
	.name    = "base",
	.onenter = base_onenter,
	.onexit  = base_onexit,
	.entries = base_entries,
};

/***********************************************************************
 * `base` initialization
 */
static int base_fini();

static int base_init()
{
	uid_t uid = -1;
	gid_t gid = -1;
	int devnull = -1;

	if (!instance.configured) {
		log_error(LOG_ERR, "there is no configured instance of `base`, check config file");
		return -1;
	}

	if (instance.redirector->init && instance.redirector->init() < 0)
		return -1;

	if (instance.user) {
		struct passwd *pw = getpwnam(instance.user);
		if (pw == NULL) {
			log_errno(LOG_ERR, "getpwnam(%s)", instance.user);
			goto fail;
		}
		uid = pw->pw_uid;
	}

	if (instance.group) {
		struct group *gr = getgrnam(instance.group);
		if (gr == NULL) {
			log_errno(LOG_ERR, "getgrnam(%s)", instance.group);
			goto fail;
		}
		gid = gr->gr_gid;
	}

	if (log_preopen(
			instance.log_name ? instance.log_name : instance.daemon ? "syslog:daemon" : "stderr",
			instance.log_debug,
			instance.log_info
	) < 0 ) {
		goto fail;
	}

	if (instance.rlimit_nofile) {
		struct rlimit rlmt;
		rlmt.rlim_cur = instance.rlimit_nofile;
		rlmt.rlim_max = instance.rlimit_nofile;
		if (setrlimit(RLIMIT_NOFILE, &rlmt) != 0) {
			log_errno(LOG_ERR, "setrlimit(RLIMIT_NOFILE, %u)", instance.rlimit_nofile);
			goto fail;
		}
	} else {
		struct rlimit rlmt;
		if (getrlimit(RLIMIT_NOFILE, &rlmt) != 0) {
			log_errno(LOG_ERR, "getrlimit(RLIMIT_NOFILE)");
			goto fail;
		}
		instance.rlimit_nofile = rlmt.rlim_cur;
	}

	if (!instance.redsocks_conn_max) {
		instance.redsocks_conn_max = (instance.rlimit_nofile - instance.rlimit_nofile / 4)
			/ (redsocks_has_splice_instance() ? 6 : 2);
	}

	if (instance.daemon) {
		devnull = open("/dev/null", O_RDWR);
		if (devnull == -1) {
			log_errno(LOG_ERR, "open(\"/dev/null\", O_RDWR");
			goto fail;
		}
	}

	if (instance.chroot) {
		if (chroot(instance.chroot) < 0) {
			log_errno(LOG_ERR, "chroot(%s)", instance.chroot);
			goto fail;
		}
	}

	if (instance.daemon || instance.chroot) {
		if (chdir("/") < 0) {
			log_errno(LOG_ERR, "chdir(\"/\")");
			goto fail;
		}
	}

	if (instance.group) {
		if (setgid(gid) < 0) {
			log_errno(LOG_ERR, "setgid(%i)", gid);
			goto fail;
		}
	}

	if (instance.user) {
		if (setuid(uid) < 0) {
			log_errno(LOG_ERR, "setuid(%i)", uid);
			goto fail;
		}
	}

	if (instance.daemon) {
		switch (fork()) {
		case -1: // error
			log_errno(LOG_ERR, "fork()");
			goto fail;
		case 0:  // child
			break;
		default: // parent, pid is returned
			exit(EXIT_SUCCESS);
		}
	}

	log_open(); // child has nothing to do with TTY

	if (instance.daemon) {
		if (setsid() < 0) {
			log_errno(LOG_ERR, "setsid()");
			goto fail;
		}

		int fds[] = { STDIN_FILENO, STDOUT_FILENO, STDERR_FILENO };
		int *pfd;
		FOREACH(pfd, fds)
			if (dup2(devnull, *pfd) < 0) {
				log_errno(LOG_ERR, "dup2(devnull, %i)", *pfd);
				goto fail;
			}

		close(devnull);
	}
	return 0;
fail:
	if (devnull != -1)
		close(devnull);

	base_fini();

	return -1;
}

/*
 * Tear down the `base` subsystem: run the backend's fini hook (if any),
 * free the strings owned by `instance` and reset it to all-zero.
 *
 * Safe to call when no backend is selected, including a repeated call after
 * `instance` has already been zeroed. Always returns 0.
 */
static int base_fini()
{
	/* redirector is NULL after the memset below or if no backend was chosen */
	if (instance.redirector && instance.redirector->fini)
		instance.redirector->fini();

	free(instance.chroot);
	free(instance.user);
	free(instance.group);
	free(instance.redirector_name);
	free(instance.log_name);

	memset(&instance, 0, sizeof(instance));

	return 0;
}

/* Public descriptor of the `base` subsystem: config section and init/fini hooks. */
app_subsys base_subsys =
{
	.conf_section = &base_conf_section,
	.init         = base_init,
	.fini         = base_fini,
};

/* vim:set tabstop=4 softtabstop=4 shiftwidth=4: */
/* vim:set foldmethod=marker foldlevel=32 foldmarker={,}: */

217

218

219

220

221

222

223

224

225

226

227

228

229

230

#ifdef USE_IPTABLES

/*
 * Excerpt of the "iptables" backend: read the pre-NAT destination of the
 * accepted socket `fd` with getsockopt(SOL_IP, SO_ORIGINAL_DST) into
 * *destaddr. `client` and `bindaddr` are not used.
 * Returns 0 on success; logs a warning and returns -1 when getsockopt fails.
 */
static int getdestaddr_iptables(int fd, const struct sockaddr_in *client, const struct sockaddr_in *bindaddr, struct sockaddr_in *destaddr)
{
	socklen_t addrlen = sizeof(struct sockaddr_in);

	if (getsockopt(fd, SOL_IP, SO_ORIGINAL_DST, destaddr, &addrlen) == 0)
		return 0;

	log_errno(LOG_WARNING, "getsockopt");
	return -1;
}

#endif

问题所在

假定有以下的网络架构：

正常情况下，NAT与getsockopt(…, SO_ORIGINAL_DST, …)调用程序都运行在Router上，工作正常。

现在把getsockopt(…, SO_ORIGINAL_DST, …)调用程序迁移到Server 1上，很容易想到的，就是把原来的NAT规则改成DNAT的，并指定目的地IP与端口：

iptables -t nat ... -p tcp ... -j DNAT --to 192.168.1.2:GETSOCKOPT_PROGRAM_LISTEN_PORT

1	iptables -t nat ... -p tcp ... -j DNAT --to 192.168.1.2:GETSOCKOPT_PROGRAM_LISTEN_PORT

NAT的记录由内核管理，这隐含的意思是，只有在发起NAT处理的内核上才能获取到NAT前的信息。

若调用远程的getsockopt(…, SO_ORIGINAL_DST, …)调用程序，即发起NAT的与运行getsockopt(…, SO_ORIGINAL_DST, …)调用程序的不是同一个内核，很明显，这些程序对getsockopt()的调用不可能实现他们的目的。

问题已很明显，这类程序无法通过getsockopt()调用取得真实目的地。

实现getsockopt()的远程调用成本有点高，不太现实。

不过，我们把方式稍作改变，让“远程”变为“本地”。

实现

假定有以下的网络架构：

Server 1上运行着getsockopt(…, SO_ORIGINAL_DST, …)调用程序。

既然处理NAT要与调用getsockopt()的使用同一个内核，那么把需要NAT的那部分也交给Server 1处理好了。

要实现这种需求，可以利用策略路由，正常的数据包按正常处理，原本需要NAT到Server 1的数据包，使用路由规则转发到Server 1。这样就不需要网内其余联网设备作任何改变，只需要对Router与Server 1进行操作。

准备

首先在Router上把针对getsockopt(…, SO_ORIGINAL_DST, …)调用程序的NAT规则删掉。

路由表

修改/etc/iproute2/rt_tables增加一个路由表，如12行，增加一个ID为250，名为server1的路由表：

#
# reserved values
#
255	local
254	main
253	default
0	unspec
#
# local
#
#1	inr.ruhep
250	server1

12	250 server1

往server1路由表中增加一条默认路由规则，把所有数据包都转发到Server 1：

ip route add default via 192.168.1.2 table server1

1	ip route add default via 192.168.1.2 table server1

数据包分类

接下来需要把数据包分类，即决定哪些数据包应使用路由表server1。

我这里选择iptables的MARK模块，给所有需要交给getsockopt(…, SO_ORIGINAL_DST, …)调用程序处理的数据包都打上一个固定标记，然后通过路由策略让带有该标记的数据包都按路由表server1处理。

iptables MARK模块的--set-mark操作仅能在mangle表完成。

假设MARK为150（把…改成你需要的参数）：

iptables -t mangle ...  -j MARK --set-mark 150

1	iptables -t mangle ... -j MARK --set-mark 150

这里需要注意，Server 1访问国际互联网，也要通过Router，如果getsockopt(…, SO_ORIGINAL_DST, …)调用程序访问互联网会匹配到上面的规则，必须排除，否则就有环路了，最简单的，就是直接排除掉Server 1的MAC地址：

iptables -t mangle ... -m mac --mac-source SERVER1_MAC -j RETURN

1	iptables -t mangle ... -m mac --mac-source SERVER1_MAC -j RETURN

这条排除Server 1的规则需要插入在MARK之前。

最后修改路由策略，让带150 MARK的数据包都使用路由表server1：

ip rule add fwmark 150 table server1

1	ip rule add fwmark 150 table server1

Server 1的处理

数据包已在Router处分类，并在网络层和传输层Header未被修改的情况下转发到Server 1。

Server 1要区分这些数据包是要交给getsockopt(…, SO_ORIGINAL_DST, …)调用程序，还是交给自己的程序处理，很简单，看看数据包目的IP是不是自己就行了，要交给getsockopt(…, SO_ORIGINAL_DST, …)调用程序的数据包，目的IP肯定不是Server 1的IP：

iptables -t nat ... ! -d 192.168.1.2 -p tcp -j REDIRECT --to GETSOCKOPT_PROGRAM_LISTEN_PORT

1	iptables -t nat ... ! -d 192.168.1.2 -p tcp -j REDIRECT --to GETSOCKOPT_PROGRAM_LISTEN_PORT

总结

这个方法仅适用于Router与Server 1处于同一局域网的情况，因为策略路由指定的下一跳必须在链路层上直接可达；若需要跨网域调用，可能要考虑使用隧道封装，但这样开销有点大。

Router一般都在芯片上实现了NAT，NAT效率高，Server 1虽然没芯片级的NAT，但我相信，你处理被NAT的数据包的程序，开销比NAT大得多。

Redsocks/ss-redir的实现

问题所在

实现

准备

路由表

数据包分类

Server 1的处理

总结

You might also like: