import numpy
import pandas as pd


def _fetch_documentation(version, base_url="https://spark.apache.org/docs"):
    doc_urls = [
        "{base_url}/{version}/configuration.html",
        "{base_url}/{version}/sql-programming-guide.html",
        "{base_url}/{version}/monitoring.html",
        "{base_url}/{version}/spark-standalone.html",
        "{base_url}/{version}/running-on-mesos.html",
        "{base_url}/{version}/running-on-yarn.html",
    ]
    for url in doc_urls:
        doc_url = url.format(version=version, base_url=base_url)
        print("Loading spark properties from %s" % doc_url)
        dfs = pd.read_html(doc_url, header=0)
        desired_cols = ["Property Name", "Default", "Meaning"]
        for df in dfs:
            # Only the tables that list configuration properties have these columns.
            if ("Property Name" in df) and ("Default" in df):
                for pn, default, desc in df[desired_cols].itertuples(index=False):
                    # pandas parses boolean defaults as numpy.bool_; coerce to plain bool.
                    if isinstance(default, numpy.bool_):
                        default = bool(default)
                    yield pn, default, desc
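
A minimal usage sketch, assuming network access to spark.apache.org (pandas.read_html also needs an HTML parser such as lxml installed); "latest" is one valid version segment in the docs URLs, and a pinned release string like "2.4.0" works the same way:

if __name__ == "__main__":
    # Stream (property, default, meaning) tuples out of the official docs.
    for name, default, meaning in _fetch_documentation("latest"):
        print("%s (default: %s)" % (name, default))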