ansible1.9.6源码分析
ansible-1.9.6源码分析
开篇说明
ansible-1.9.6是ansible 1的最后一个版本,之后ansible项目组重写了ansible工程,并在很多方面进行了优化,同时也新增了不少全新的功能。但是ansible 2在其核心思想上并没有改变,同样是将模块代码组合成执行的python代码,通过ssh上传python文件到远端主机并执行,最后组装返回结果并打印。现在我们将借助一个playbook的例子,研究ansible-1.9.6的执行流程,之后我们将进入ansible 2的源码学习,并认真研究每一个版本的改进代码,学习查找并修复ansible源码的bug,争取在2020年能为ansible社区找到1-5个bug,完成自己的第一次开源贡献。
我准备了一个playbook例子,比较简单,就涉及了几个常用的模块,具体的yaml文件内容如下:
---
- hosts: ceph1
tasks:
- name: get hostname
shell: 'hostname'
register: shell_out
- name: debug message
debug:
msg: "hello, {{ shell_out.stdout }}"
- name: set facts
set_fact:
host_name: "{{ shell_out.stdout }}"
- name: create new directory
file:
path: '~/create_test'
state: directory
- name: upload file
template:
src: '/home/shen/Desktop/upload_file.txt.j2'
dest: '~/create_test/upload_file.txt'
调试多节点时,会出现错误,为方便起见,我们使用单个节点作为目标节点进行调试。上面一共有5个tasks,分别用到了shell、debug、set_fact、file和template模块。现在我们在pycharm里面设置执行命令ansible-playbook -i hosts test.yml,由于代码量比较多,我们在跟踪的时候会略去比较多的细节代码,只关注核心执行过程。
首先执行ansible-playbook命令文件,具体代码如下:
# 导入依赖模块
...
# 两个打印显示颜色的辅助函数
...
def main(args):
''' run ansible-playbook operations '''
# Abridged entry point: returns 1 when no playbook argument is given or an
# AnsibleError is caught while running a playbook, otherwise 0.
# Parse the command-line arguments (option setup elided).
...
options, args = parser.parse_args(args)
if len(args) == 0:
parser.print_help(file=sys.stderr)
return 1
# Set up some variables (elided).
...
# Decide whether encrypted files need handling (elided).
...
# Check the playbook files and the target hosts (elided).
...
for playbook in args:
# Per-playbook variable handling omitted.
...
# Key step: build the PlayBook object from the parsed options.
pb = ansible.playbook.PlayBook(
playbook=playbook,
module_path=options.module_path,
inventory=inventory,
forks=options.forks,
remote_user=options.remote_user,
remote_pass=sshpass,
callbacks=playbook_cb,
runner_callbacks=runner_cb,
stats=stats,
timeout=options.timeout,
transport=options.connection,
become=options.become,
become_method=options.become_method,
become_user=options.become_user,
become_pass=becomepass,
extra_vars=extra_vars,
private_key_file=options.private_key_file,
only_tags=only_tags,
skip_tags=skip_tags,
check=options.check,
diff=options.diff,
vault_password=vault_pass,
force_handlers=options.force_handlers,
)
# Check the variables in the playbook file and whether it contains errors (elided).
...
try:
# Core step: run the playbook's tasks and obtain the results.
pb.run()
# Print the final results (elided).
...
except errors.AnsibleError, e:
# Print the error output (elided).
...
return 1
return 0
# Script entry: log the command line, run main() and exit with its return
# value; exit with status 1 on AnsibleError or KeyboardInterrupt.
if __name__ == "__main__":
display(" ", log_only=True)
display(" ".join(sys.argv), log_only=True)
display(" ", log_only=True)
try:
sys.exit(main(sys.argv[1:]))
except errors.AnsibleError, e:
# Print the error message (elided).
...
sys.exit(1)
except KeyboardInterrupt, ke:
display("ERROR: interrupted", color='red', stderr=True)
sys.exit(1)
现在我们的目标,转向lib/ansible/playbook下的源码文件,这里一共有3个文件,分别为__init__.py、play.py和task.py。其中__init__.py文件中定义了核心的PlayBook类,play.py和task.py中分别定义了核心的Play类和Task类。下面先学习Task类,定义在task.py文件中。
# 导入相应模块,略
...
class Task(object):
# Attribute definitions omitted in this excerpt.
...
def __init__(self, play, ds, module_vars=None, play_vars=None, play_file_vars=None, role_vars=None, role_params=None, default_vars=None, additional_conditions=None, role_name=None, no_tags=True):
# Builds a Task from ds, the dict form of one task entry, normalising
# its keys into 'action', 'args', 'when' and lookup-plugin entries.
# Parse metadata (elided).
...
# Register <play.basedir>/library with the module finder when that
# directory exists.
library = os.path.join(play.basedir, 'library')
if os.path.exists(library):
utils.plugins.module_finder.add_directory(library)
# The string below is kept verbatim from the article. It illustrates ds:
# the task's top-level keys as a dict, e.g.
# {'name': 'test hello', 'shell': 'echo hello'}.
"""
将一个task的第一层属性取出来分析,比如下面一个task:
- name: test hello
shell: echo hello
这里ds时将这个task转成字典形式:
ds = {'name': 'test hello', 'shell': 'echo hello'}
"""
for x in ds.keys():
if x in utils.plugins.module_finder:
if 'action' in ds:
# A module key (e.g. shell) and an explicit 'action' key may not both be present.
raise errors.AnsibleError("...")
if isinstance(ds[x], dict):
# ds[x] holds the module arguments, already in dict form.
if 'args' in ds:
# 'args' must not also be given in ds.
raise errors.AnsibleError("...")
# Move the dict arguments under the 'args' key.
ds['args'] = ds[x]
ds[x] = ''
elif ds[x] is None:
ds[x] = ''
if not isinstance(ds[x], basestring):
raise errors.AnsibleError("")
# Combine the module name and its argument string into the action.
ds['action'] = x + " " + ds[x]
ds.pop(x)
elif x.startswith("with_"):
# Handle with_<plugin> loops.
if isinstance(ds[x], basestring):
param = ds[x].strip()
plugin_name = x.replace("with_","")
# The string below is kept verbatim from the article. It notes that the
# suffix after with_ must name a lookup plugin (e.g. items, dict); the
# check against utils.plugins.lookup_loader follows it.
"""
这里会判断with_后面的name,支持的有items(对应
items.py),dict(对应dict.py)等,可以查看目录
lib/ansible/runner/lookup_plugins下对应的python文
件,就知道支持with_的多少种写法,一种写法对应一个
python文件,这样的写法在ansible中经常使用
"""
if plugin_name in utils.plugins.lookup_loader:
ds['items_lookup_plugin'] = plugin_name
ds['items_lookup_terms'] = ds[x]
ds.pop(x)
else:
raise errors.AnsibleError("")
elif x in [ 'changed_when', 'failed_when', 'when']:
# Handle changed_when, failed_when and when (elided).
...
elif x.startswith("when_"):
# when_<name> form (the article author notes being unfamiliar with it).
...
if 'when' in ds:
# A when_ key may not be combined with an existing 'when'.
raise errors.AnsibleError("...")
when_name = x.replace("when_","")
ds['when'] = "%s %s" % (when_name, ds[x])
ds.pop(x)
elif not x in Task.VALID_KEYS:
raise errors.AnsibleError("")
# Attribute assignment (elided).
...
# Assignment of further attributes.
self.name = ds.get('name', None)
self.register = ds.get('register', None)
self.environment = ds.get('environment', play.environment)
self.role_name = role_name
self.no_log = utils.boolean(ds.get('no_log', "false")) or self.play.no_log
# run_once flag.
self.run_once = utils.boolean(ds.get('run_once', 'false'))
if 'until' in ds:
if not ds.get('register'):
# 'until' requires 'register'.
raise errors.AnsibleError("...")
self.module_vars['delay'] = ds.get('delay', 5)
self.module_vars['retries'] = ds.get('retries', 3)
self.module_vars['register'] = ds.get('register', None)
self.until = ds.get('until')
self.module_vars['until'] = self.until
self.args = ds.get('args', {
})
# get remote_user for task, then play, then playbook
...
# become handling (elided).
...
# su, sudo_user, sudo_pass and related attributes (elided).
...
# Both are defined
if ('action' in ds) and ('local_action' in ds):
raise errors.AnsibleError("...")
# Both are NOT defined
elif (not 'action' in ds) and (not 'local_action' in ds):
raise errors.AnsibleError("...")
# Only one of them is defined
elif 'local_action' in ds:
self.action = ds.get('local_action', '')
self.delegate_to = '127.0.0.1'
else:
self.action = ds.get('action', '')
self.delegate_to = ds.get('delegate_to', None)
self.transport = ds.get('connection', ds.get('transport', play.transport))
# Module argument checks (elided).
...
# Handle delegate_to.
if not (self.delegate_to is None):
# delegate_to: localhost should use local transport
if self.delegate_to in ['127.0.0.1', 'localhost']:
self.transport = 'local'
if self.name is None:
self.name = self.action
# load various attributes
self.when = ds.get('when', None)
self.changed_when = ds.get('changed_when', None)
self.failed_when = ds.get('failed_when', None)
# Merge the variable sources; all_vars is used below to template the
# async and poll values.
all_vars = self.default_vars.copy()
all_vars = utils.combine_vars(all_vars, self.play_vars)
all_vars = utils.combine_vars(all_vars, self.play_file_vars)
all_vars = utils.combine_vars(all_vars, self.role_vars)
all_vars = utils.combine_vars(all_vars, self.module_vars)
all_vars = utils.combine_vars(all_vars, self.role_params)
# 'async' defaults to 0 when the task does not set it.
self.async_seconds = ds.get('async', 0)
self.async_seconds = template.template_from_string(play.basedir, self.async_seconds, all_vars)
self.async_seconds = int(self.async_seconds)
# 'poll' defaults to 10 when the task does not set it.
self.async_poll_interval = ds.get('poll', 10)
self.async_poll_interval = template.template_from_string(play.basedir, self.async_poll_interval, all_vars)
self.async_poll_interval = int(self.async_poll_interval)
# notify entries.
self.notify = ds.get('notify', [])
self.first_available_file = ds.get('first_available_file', None)
self.items_lookup_plugin = ds.get('items_lookup_plugin', None)
self.items_lookup_terms = ds.get('items_lookup_terms', None)
# Other supported attributes, e.g. ignore_errors (whether to ignore this task's errors).
self.ignore_errors = ds.get('ignore_errors', False)
self.any_errors_fatal = ds.get('any_errors_fatal', play.any_errors_fatal)
self.always_run = ds.get('always_run', False)
# The action must be a string; the module name is split out of it below.
if not isinstance(self.action, basestring):
raise errors.AnsibleError("...")
# Normalise a single notify string to a one-element list.
if isinstance(self.notify, basestring):
self.notify = [ self.notify ]
try:
tokens = split_args(self.action)
except Exception, e:
# Report the exception (elided).
...
if len(tokens) < 1:
# No tokens: raise. The first token is taken as the module name below.
raise errors.AnsibleError("...")
# Module name for this task.
self.module_name = tokens[0]
# Module arguments given inline after the module name, if any.
self.module_args = ''
if len(tokens) > 1:
self.module_args = " ".join(tokens[1:])
# first_available_file and a with_ lookup plugin may not both be set.
incompatibles = [ x for x in [ self.first_available_file, self.items_lookup_plugin ] if x is not None ]
if len(incompatibles) > 1:
raise errors.AnsibleError('...')
# make first_available_file accessible to Runner code
...
if self.items_lookup_plugin is not None:
self.module_vars['items_lookup_plugin'] = self.items_lookup_plugin
self.module_vars['items_lookup_terms'] = self.items_lookup_terms
# allow runner to see delegate_to option
self.module_vars['delegate_to'] = self.delegate_to
# make some task attributes accessible to Runner code
self.module_vars['ignore_errors'] = self.ignore_errors
self.module_vars['register'] = self.register
self.module_vars['changed_when'] = self.changed_when
self.module_vars['failed_when'] = self.failed_when
self.module_vars['always_run'] = self.always_run
self.tags = self._load_tags(ds, self.module_vars)
if additional_conditions:
new_conditions = additional_conditions[:]
if self.when:
new_conditions.append(self.when)
self.when = new_conditions
def _load_tags(self, ds, module_vars):
# Skipped in this walkthrough (not core).
...
Task类代表了playbook文件中一次执行的任务,它在__init__中解析了yaml文件的task任务。从源代码中,我们也可以看到ansible的playbook文件中task的写法支持许多操作,比如when、with_items、changed_when、failed_when、async、poll、notify、ignore_errors等等。仔细阅读前面的代码,初始化代码要考虑非常多的情况,解析各种可能出现的正常或者异常情况,所以内容比较多。我已经简化了源码,也做好了部分的注释。
接下来,我们学习另外一个核心的类:Play,具体代码如下:
# 导入模块,略
...
class Play(object):
# Some attribute definitions omitted in this excerpt.
...
def __init__(self, playbook, ds, basedir, vault_password=None):
# ds is the play's data structure (a dict); every key must be listed in
# Play.VALID_KEYS, otherwise an AnsibleError is raised.
for x in ds.keys():
if not x in Play.VALID_KEYS:
raise errors.AnsibleError("%s is not a legal parameter of an Ansible Play" % x)
# playbook-related parameters (elided).
...
# other attributes (elided).
...
# The two key lines: build the task and handler lists via _load_tasks.
self._tasks = self._load_tasks(self._ds.get('tasks', []), load_vars)
self._handlers = self._load_tasks(self._ds.get('handlers', []), load_vars)
# apply any missing tags to role tasks
self._late_merge_role_tags()
# place holder for the discovered hosts to be used in this play
self._play_hosts = None
def _get_role_path(self, role):
...
def _build_role_dependencies(self, roles, dep_stack, passed_vars={
}, level=0):
...
def _load_role_vars_files(self, vars_files):
...
def _load_role_defaults(self, defaults_files):
...
def _load_roles(self, roles, ds):
...
def _resolve_main(self, basepath):
...
# Core function that processes tasks (body elided).
def _load_tasks(self, tasks, vars=None, role_params=None, default_vars=None, become_vars=None,
additional_conditions=None, original_file=None, role_name=None):
...
def tasks(self):
''' return task objects for this play '''
return self._tasks
def handlers(self):
''' return handler objects for this play '''
return self._handlers
# Remaining variable-handling functions (elided).
...
现在回过头来继续跟踪pb.run()方法执行的过程。
# 省略导入和全局变量
...
class PlayBook(object):
# 省略部分函数
...
# run方法
def run(self):
''' run all patterns in the playbook '''
# Returns a dict mapping each processed host to self.stats.summarize(host).
plays = []
matched_tags_all = set()
unmatched_tags_all = set()
self.callbacks.on_start()
for (play_ds, play_basedir) in zip(self.playbook, self.play_basedirs):
...
# Less important code omitted.
...
# Core part: run each entry of plays in order and stop at the first one
# for which _run_play returns a false value.
for play in plays:
ansible.callbacks.set_play(self.callbacks, play)
ansible.callbacks.set_play(self.runner_callbacks, play)
# Core call: run all tasks of the play.
if not self._run_play(play):
break
# Clear the current play on both callback objects.
ansible.callbacks.set_play(self.callbacks, None)
ansible.callbacks.set_play(self.runner_callbacks, None)
# summarize the results
results = {
}
for host in self.stats.processed.keys():
results[host] = self.stats.summarize(host)
return results
# 忽略其他函数
....
继续在这个类中追踪self._run_play(play)方法,这个部分处理代码有些大,请耐心阅读。
class PlayBook(object):
...
def _run_play(self, play):
''' run a list of tasks for a given pattern, in order '''
self.callbacks.on_play_start(play.name)
# Hosts this play targets.
play._play_hosts = self.inventory.list_hosts(play.hosts)
# No matching hosts: return early.
if not play._play_hosts:
self.callbacks.on_no_hosts_matched()
return True
# The string below is kept verbatim from the article. It explains that this
# step gathers host facts by running the setup module, and that setting
# gather_facts to false disables it.
"""
这一步就是默认要执行的收集主机信息的代码,可以通过设置
gather_facts为false则可以不收集目标主机信息,而这个函数
就是执行setup模块完成的
"""
self._do_setup_step(play)
all_hosts = self._trim_unavailable_hosts(play._play_hosts)
play.update_vars_files(all_hosts, vault_password=self.vault_password)
hosts_count = len(all_hosts)
# Some detail omitted.
...
task_errors = False
for on_hosts in serialized_batch:
play._play_hosts = self._trim_unavailable_hosts(on_hosts)
self.inventory.also_restrict_to(on_hosts)
for task in self.tasks_to_run_in_play(play):
...
# Core call that runs each task.
if not self._run_task(play, task, False):
return False
host_list = self._trim_unavailable_hosts(play._play_hosts)
# Set max_fail_pct to 0, So if any hosts fails, bail out
if task.any_errors_fatal and len(host_list) < hosts_count:
play.max_fail_pct = 0
# If threshold for max nodes failed is exceeded, bail out.
if play.serial > 0:
# if serial is set, we need to shorten the size of host_count
play_count = len(play._play_hosts)
if (play_count - len(host_list)) > int((play.max_fail_pct)/100.0 * play_count):
host_list = None
else:
if (hosts_count - len(host_list)) > int((play.max_fail_pct)/100.0 * hosts_count):
host_list = None
# if no hosts remain, drop out
if not host_list:
if play.force_handlers:
task_errors = True
break
else:
self.callbacks.on_no_hosts_remaining()
return False
# lift restrictions after each play finishes
self.inventory.lift_also_restriction()
if task_errors and not play.force_handlers:
# if there were failed tasks and handler execution
# is not forced, quit the play with an error
return False
elif task_errors:
# if there were failed tasks and handler execution is forced,
# execute all handlers and quit the play with an error
self.run_handlers(play)
return False
else:
# no errors, go ahead and execute all handlers
if not self.run_handlers(play):
return False
...
def _do_setup_step(self, play):
''' get facts from the remote system '''
# Hosts to gather facts from.
host_list = self._trim_unavailable_hosts(play._play_hosts)
if play.gather_facts is None and C.DEFAULT_GATHERING == 'smart':
# 'smart' gathering: keep only hosts that have no cached
# 'module_setup' entry; return {} when none are left.
host_list = [h for h in host_list if h not in self.SETUP_CACHE or 'module_setup' not in self.SETUP_CACHE[h]]
if len(host_list) == 0:
return {
}
elif play.gather_facts is False or (play.gather_facts is None and C.DEFAULT_GATHERING == 'explicit'):
# gather_facts is False, or it is unset and C.DEFAULT_GATHERING is
# 'explicit': return {} without running setup.
return {
}
self.callbacks.on_setup()
self.inventory.restrict_to(host_list)
ansible.callbacks.set_task(self.callbacks, None)
ansible.callbacks.set_task(self.runner_callbacks, None)
# Run the setup module through a Runner.
setup_results = ansible.runner.Runner(
basedir=self.basedir,
pattern=play.hosts,
module_name='setup',
module_args={
},
inventory=self.inventory,
forks=self.forks,
module_path=self.module_path,
timeout=self.timeout,
remote_user=play.remote_user,
remote_pass=self.remote_pass,
remote_port=play.remote_port,
private_key_file=self.private_key_file,
setup_cache=self.SETUP_CACHE,
vars_cache=self.VARS_CACHE,
callbacks=self.runner_callbacks,
become=play.become,
become_method=play.become_method,
become_user=play.become_user,
become_pass=self.become_pass,
vault_pass=self.vault_password,
transport=play.transport,
is_playbook=True,
module_vars=play.vars,
play_vars=play.vars,
play_file_vars=play.vars_file_vars,
role_vars=play.role_vars,
default_vars=play.default_vars,
check=self.check,
diff=self.diff,
accelerate=play.accelerate,
accelerate_port=play.accelerate_port,
).run()
# Record the setup results in the stats, then lift the host restriction.
self.stats.compute(setup_results, setup=True)
self.inventory.lift_restriction()
# now for each result, load into the setup cache so we can
# let runner template out future commands
setup_ok = setup_results.get('contacted', {
})
for (host, result) in setup_ok.iteritems():
utils.update_hash(self.SETUP_CACHE, host, {
'module_setup': True})
utils.update_hash(self.SETUP_CACHE, host, result.get('ansible_facts', {
}))
return setup_results
官方代码注释也比较详细,最后落到执行每个task,就是self._run_task()这个方法。接下来,继续追踪这个方法,代码如下:
class PlayBook(object):
...
def _run_task(self, play, task, is_handler):
''' run a single task in the playbook and recursively run any subtasks. '''
ansible.callbacks.set_task(self.callbacks, task)
ansible.callbacks.set_task(self.runner_callbacks, task)
if task.role_name:
name = '%s | %s' % (task.role_name, task.name)
else:
name = task.name
# Templating the display name is best effort: on any exception the
# untemplated name is kept.
try:
name = template(play.basedir, name, task.module_vars, lookup_fatal=False, filter_fatal=False)
except:
pass
self.callbacks.on_task_start(name, is_handler)
# If the callbacks object flags skip_task, clear the current task and
# return True without running anything.
if hasattr(self.callbacks, 'skip_task') and self.callbacks.skip_task:
ansible.callbacks.set_task(self.callbacks, None)
ansible.callbacks.set_task(self.runner_callbacks, None)
return True
cond = template(play.basedir, task.ignore_errors, task.module_vars, expand_lists=False)
# Evaluate whether errors from this task should be ignored.
task.ignore_errors = utils.check_conditional(cond, play.basedir, task.module_vars, fail_on_undefined=C.DEFAULT_UNDEFINED_VAR_BEHAVIOR)
include_failed = is_handler and play.force_handlers
# Run the task and collect the results ##########################
results = self._run_task_internal(task, include_failed=include_failed)
##################################################
hosts_remaining = True
# A None result is replaced by an empty dict and clears hosts_remaining.
if results is None:
hosts_remaining = False
results = {
}
contacted = results.get('contacted', {
})
self.stats.compute(results, ignore_errors=task.ignore_errors)
# 忽略其他函数
...
继续追踪self._run_task_internal()方法的代码,如下:
class PlayBook(object):
...
def _run_task_internal(self, task, include_failed=False):
''' run a particular module step in a playbook '''
hosts = self._trim_unavailable_hosts(self.inventory.list_hosts(task.play._play_hosts), keep_failed=include_failed)
self.inventory.restrict_to(hosts)
runner = ansible.runner.Runner(
pattern=task.play.hosts,
inventory=self.inventory,
module_name=task.module_name,
module_args=task.module_args,
forks=self.forks,
remote_pass=self.remote_pass,
module_path=self.module_path,
timeout=self.timeout,
remote_user=task.remote_user,
remote_port=task.play.remote_port,
module_vars=task.module_vars,
play_vars=task.play_vars,
play_file_vars=task.play_file_vars,
role_vars=task.role_vars,
role_params=task.role_params,
default_vars=task.default_vars,
extra_vars=self.extra_vars,
private_key_file=self.private_key_file,
setup_cache=self.SETUP_CACHE,
vars_cache=self.VARS_CACHE,
basedir=task.play.basedir,
conditional=task.when,
callbacks=self.runner_callbacks,
transport=task.transport,
is_playbook=True,
check=self.check,
diff=self.diff,
environment=task.environment,
complex_args=task.args,
accelerate=task.play.accelerate,
accelerate_port=task.play.accelerate_port,
accelerate_ipv6=task.play.accelerate_ipv6,
error_on_undefined_vars=C.DEFAULT_UNDEFINED_VAR_BEHAVIOR,
vault_pass = self.vault_password,
run_hosts=hosts,
no_log=task.no_log,
run_once=task.run_once,
become=task.become,
become_method=task.become_method,
become_user=task.become_user,
become_pass=task.become_pass,
)
runner.module_vars.update({
'play_hosts'