openshift-docs/build_for_portal.py
(mirror of https://github.com/openshift/openshift-docs.git)
#!/usr/bin/python
# see notes in the build.py script as to what this file does
import argparse
import configparser
import filecmp
import fnmatch
import os
import re
import shutil
import subprocess
import sys
import tempfile
import time
import requests
import yaml
import logging
# See manual and pip3 install aura.tar.gz for logging
from aura import cli
cli.init_logging(False, True)
list_of_errors = []
CLONE_DIR = "."
BASE_PORTAL_URL = "https://docs.redhat.com/en/documentation/"
# ID_RE = re.compile("^\[(?:\[|id=\'|#)(.*?)(\'?,.*?)?(?:\]|\')?\]", re.M | re.DOTALL)
ID_RE = re.compile(
    r"^\[(?:\[|id='|#|id=\")(.*?)('?,.*?)?(?:\]|'|\")?\]", re.M | re.DOTALL
)
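# Example matches (illustrative): "[[cli-reference]]", "[id='cli-reference']", "[#cli-reference]"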
LINKS_RE = re.compile(
    r"(?:xref|link):([\./\w_-]*/?[\w_.-]*\.(?:html|adoc))?(#[\w_-]*)?(\[.*?\])",
    re.M | re.DOTALL,
)
EXTERNAL_LINK_RE = re.compile(
    r"[\./]*([\w_-]+)/[\w_/-]*?([\w_.-]*\.(?:html|adoc))", re.DOTALL
)
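# Example matches (illustrative):
#   LINKS_RE matches "xref:../dev_guide/builds.adoc#build-triggers[Build triggers]"
#   EXTERNAL_LINK_RE matches "../dev_guide/builds.adoc" with groups ("dev_guide", "builds.adoc")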
INCLUDE_RE = re.compile(r"^include::(.*?)\[(.*?)\]", re.M)
IFDEF_RE = re.compile(r"^if(n?)def::(.*?)\[\]", re.M)
ENDIF_RE = re.compile(r"^endif::(.*?)\[\]\r?\n", re.M)
COMMENT_CONTENT_RE = re.compile(r"^////$.*?^////$", re.M | re.DOTALL)
COMMENTED_XREF_RE = re.compile(r"^//.*xref:.*$")
TAG_CONTENT_RE = re.compile(
r"//\s+tag::(.*?)\[\].*?// end::(.*?)\[\]", re.M | re.DOTALL
)
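# Example matches (illustrative):
#   INCLUDE_RE matches "include::../install_config/prerequisites.adoc[leveloffset=+1,tag=basics]"
#   IFDEF_RE matches "ifdef::openshift-enterprise[]" and "ifndef::openshift-origin[]"; ENDIF_RE matches "endif::[]"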
CMP_IGNORE_FILES = [".git", ".gitignore", "README.md", "build.cfg"]
DEVNULL = open(os.devnull, "wb")
MASTER_FILE_BASE = "= {title}\n\
:product-author: {product-author}\n\
:product-title: {product}\n\
:product-version: {product-version}\n\
:{distro}:\n\
:imagesdir: images\n\
:idseparator: -\n\
{preface-title}\n"
DOCINFO_BASE = '<title>{title}</title>\n\
<productname>{{product-title}}</productname>\n\
<productnumber>{{product-version}}</productnumber>\n\
<subtitle>Enter a short description here.</subtitle>\n\
<abstract>\n\
<para>A short overview and summary of the book\'s subject and purpose, traditionally no more than one paragraph long.</para>\n\
</abstract>\n\
<authorgroup>\n\
<orgname>{product-author}</orgname>\n\
</authorgroup>\n\
<xi:include href="Common_Content/Legal_Notice.xml" xmlns:xi="http://www.w3.org/2001/XInclude" />\n'
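# Both templates above are filled in with str.format() using the per-book info dict built in
# build_master_files() (title, product, product-version, product-author, distro, preface-title).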
# A list of book titles that still use the old drupal url format (i.e. the product/version is included in the book title part)
# e.g. openshift-enterprise/version-3.0/openshift-enterprise-30-getting-started vs openshift-enterprise/version-3.0/getting-started
DRUPAL_OLD_URL_TITLES = [
"Administrator Guide",
"Architecture",
"CLI Reference",
"Creating Images",
"Developer Guide",
"Getting Started",
"REST API Reference",
"Using Images",
"What's New?",
]
# A mapping of upstream book/category names to CP book names
BOOK_NAME_OVERRIDES = {"Administration": "Administrator Guide"}
# Lines that should be stripped out/ignored when cleaning the content
IGNORE_LINES = [
"{product-author}\n",
"{product-version}\n",
"{product-version]\n",
"{Lucas Costi}\n",
"toc::[]\n",
]
# Lines starting with ":MACRO:" are omitted from the output for each MACRO in this list.
IGNORE_MACROS = ["description", "keywords", "icons", "data-uri", "toc", "toc-title"]
# Files where the title should be removed when building the all-in-one
ALL_IN_ONE_SCRAP_TITLE = ["welcome/index.adoc"]
# Files that should be commented out in the toc structure
COMMENT_FILES = [
"admin_guide/overview.adoc",
"creating_images/overview.adoc",
"dev_guide/overview.adoc",
"using_images/overview.adoc",
"rest_api/overview.adoc",
]
# Map FILENAME to a map of TITLE to ID. In most cases the
# ID is the TITLE downcased, with "strange" chars replaced by hyphens.
# A notable exception is 'any' TITLE.
TITLE_IDS = {}
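# Illustrative shape (not actual data):
#   TITLE_IDS = {"dev_guide/builds.adoc": {"Build Triggers": "build-triggers"}}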
# A dictionary of existing dup ids to new unique ids
DUPLICATE_IDS = {}
# Map FILENAME to a map of BAD to GOOD. Most of the time, BAD and GOOD
# are in link syntax, i.e., beginning with "link:", but not always.
INCORRECT_LINKS = {}
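# Illustrative shape (not actual data):
#   INCORRECT_LINKS = {"dev_guide/builds.adoc": {"link:builds.html#triggers[triggers]": "xref:triggers[triggers]"}}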
log = logging.getLogger("build")
def setup_parser():
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
parser.add_argument(
"--distro", help="The distribution to build for", default="openshift-enterprise"
)
parser.add_argument("--all-in-one", help=argparse.SUPPRESS, action="store_true")
parser.add_argument("--title", help=argparse.SUPPRESS, default="Documentation")
parser.add_argument("--product", default="OpenShift Enterprise")
parser.add_argument("--version", default="3.0")
parser.add_argument("--author", default="Red Hat OpenShift Documentation Team")
parser.add_argument(
"--upstream-url",
help="The upstream source url",
default="https://github.com/openshift/openshift-docs.git",
)
parser.add_argument(
"--upstream-branch", help="The upstream source branch", default="enterprise-3.0"
)
parser.add_argument(
"--branch", help="The GitLab branch to commit changes into", default="GA"
)
parser.add_argument(
"-p",
"--push",
help="Commit and push the changes into GitLab",
action="store_true",
)
parser.add_argument(
"--no-clean",
help="Don't clean the drupal-build directory before building",
action="store_true",
)
parser.add_argument(
"--no-upstream-fetch",
help="Don't fetch the upstream sources",
action="store_true",
)
return parser
def find_build_config_file():
"""
    Finds the build config file to use: either the combined topic maps from the _topic_maps folder, or a single _topic_map.yml file.
"""
# updated 23rd Nov to support files in _topic_maps folder
# load everything from the _topic_maps folder
file_list = os.listdir(os.path.join(CLONE_DIR, "_topic_maps"))
# create a temp file combining all values from that folder
# don't delete it immediately, and give it a suffix of swp which makes it ignored by git
with tempfile.NamedTemporaryFile(dir=CLONE_DIR, delete=False, suffix=".swp") as tmp:
for f in file_list:
with open(os.path.join(CLONE_DIR, "_topic_maps", f), "rb") as infile:
tmp.write(infile.read())
config = os.path.abspath(tmp.name)
log.info(config)
# backup look for a single _topic_map in the cloned directory
if not os.path.isfile(config):
config = os.path.abspath(os.path.join(CLONE_DIR, "_topic_map.yml"))
return config
def parse_build_config(config):
"""
Parses the build config and returns a tree based structure for the config.
"""
config = os.path.expanduser(config)
with open(config, "r") as f:
data = list(yaml.load_all(f, Loader=yaml.FullLoader))
for book in data:
book_name = book["Name"]
if book_name in BOOK_NAME_OVERRIDES:
book["Name"] = BOOK_NAME_OVERRIDES[book_name]
return data
def iter_tree(
node,
distro,
dir_callback=None,
topic_callback=None,
include_path=True,
parent_dir="",
depth=0,
):
"""
Iterates over a build config tree starting from a specific node, skipping content where the distro doesn't match.
Additionally, calls are made to the dir_callback or topic_callback functions when a directory or topic is found.
"""
if "Topics" in node:
if check_node_distro_matches(node, distro):
if include_path:
topics_dir = os.path.join(parent_dir, node["Dir"])
else:
topics_dir = ""
if dir_callback is not None:
dir_callback(node, parent_dir, depth)
for topic in node["Topics"]:
iter_tree(
topic,
distro,
dir_callback,
topic_callback,
True,
topics_dir,
depth + 1,
)
elif check_node_distro_matches(node, distro):
if topic_callback is not None:
topic_callback(node, parent_dir, depth)
def check_node_distro_matches(node, distro):
"""
    Checks whether the specified distro matches a distro in the node's Distros list. If no Distros list is
    specified on the node, then all distros are allowed, so return True.
"""
if "Distros" not in node:
return True
else:
node_distros = [x.strip() for x in node["Distros"].split(",")]
for node_distro in node_distros:
# Check for an exact match, or a glob match
if node_distro == distro or fnmatch.fnmatchcase(distro, node_distro):
return True
return False
def ensure_directory(directory):
"""
Creates DIRECTORY if it does not exist.
"""
if not os.path.exists(directory):
os.makedirs(directory)
def expand_huge_books(info):
"""
Finds nodes for huge books, creates new nodes for books from their top-level topics,
and then removes the nodes for huge books
"""
    # find all the huge books, as flagged by a hugeBook.flag file in their source directories
huge_book_nodes = [book for book in info["book_nodes"]
if os.path.exists(os.path.join(info["src_dir"],book["Dir"],"hugeBook.flag")) ]
for book in huge_book_nodes:
# save the directory in info
huge_book_dir = book["Dir"]
info["huge_book_dirs"].append(huge_book_dir)
# create the flag file in the book destination directory
book_dest_dir = os.path.join(info["dest_dir"], book["Dir"])
ensure_directory(book_dest_dir)
with open(os.path.join(book_dest_dir,"hugeBook.flag"),"w") as fi:
fi.write("hugebook")
# make new book nodes for the second-level headings
for topic in book["Topics"]:
if "Dir" in topic.keys():
info["book_nodes"].append(topic)
topic["Dir"] = huge_book_dir + "/" + topic["Dir"]
# remove book nodes for huge books
for node_to_remove in huge_book_nodes:
info["book_nodes"].remove(node_to_remove)
def build_master_files(info):
"""
Builds the master.adoc and docinfo.xml files for each guide specified in the config.
"""
# change the huge books into sub-books
expand_huge_books(info)
# TODO: Refactor. This does too much.
dest_dir = info["dest_dir"]
all_in_one = info["all_in_one"]
all_in_one_text = ""
for book in info["book_nodes"]:
book_dest_dir = os.path.join(dest_dir, book["Dir"])
ensure_directory(book_dest_dir)
book_info = dict(info)
book_info["title"] = book["Name"]
master: str = generate_master_entry(
book, book["Dir"], info["distro"], all_in_one, all_in_one=all_in_one
)
# Save the content
if not all_in_one:
master_file = os.path.join(book_dest_dir, "master.adoc")
docinfo_file = os.path.join(book_dest_dir, "docinfo.xml")
master_base = MASTER_FILE_BASE.format(**book_info)
log.debug("Writing " + master_file)
with open(master_file, "w") as f:
f.write(master_base + master)
log.debug("Writing " + docinfo_file)
with open(docinfo_file, "w") as f:
f.write(DOCINFO_BASE.format(**book_info))
else:
# TODO: Do we ever use this?
if all_in_one_text == "":
# Remove the title for the first file in the book
master = master.replace("= " + book["Name"] + "\n", "")
# Set the preface title from the first file in the book
first_file = os.path.join(
info["src_dir"], book["Dir"], book["Topics"][0]["File"] + ".adoc"
)
preface_title = None
with open(first_file, "r") as f:
line = f.readline()
while line:
if include_line(line):
preface_title = re.sub("^=+ ", "", line)
break
line = f.readline()
if preface_title is not None:
info["preface-title"] = ":preface-title: " + preface_title
all_in_one_text += master
# TODO: And is this ever used?
if all_in_one:
master_file = os.path.join(dest_dir, "master.adoc")
docinfo_file = os.path.join(dest_dir, "docinfo.xml")
master_base = MASTER_FILE_BASE.format(**info)
log.debug("Writing " + master_file)
with open(master_file, "w") as f:
f.write(master_base + all_in_one_text)
log.debug("Writing " + docinfo_file)
with open(docinfo_file, "w") as f:
f.write(DOCINFO_BASE.format(**info))
def generate_master_entry(
node: dict, book_dir: str, distro: str, include_name=True, all_in_one=False
):
"""
Given a node (book dict), generate content for that node's master.adoc file.
"""
master_entries = []
def dir_callback(dir_node, parent_dir, depth):
if include_name or depth > 0:
master_entries.append(
"=" * (depth + 1) + " " + dir_node["Name"].replace("\\", "")
)
def topic_callback(topic_node, parent_dir, depth):
book_file_path = os.path.join(parent_dir, topic_node["File"] + ".adoc")
file_path = os.path.join(book_dir, book_file_path)
include = "include::" + book_file_path + "[leveloffset=+" + str(depth) + "]"
if not all_in_one and file_path in COMMENT_FILES:
master_entries.append("////")
master_entries.append(include)
master_entries.append("////")
else:
master_entries.append(include)
# Add a blank line
master_entries.append("")
# Iterate over the tree and build the master.adoc content
iter_tree(node, distro, dir_callback, topic_callback, include_name)
return "\n".join(master_entries)
def reformat_for_drupal(info):
"""
Reformats the source content for use in the Customer Portal. This function does the following:
- Copies images over and flattens them into a single dir
- Copies source asciidoc over
    - Filters the AsciiDoc source to remove duplicate macro definitions that should only be in the main file.
    - Adds ids for each file, so the files can be properly cross referenced.
    - Adds ids to sections that are cross referenced but have no id.
    - Fixes duplicate ids in the source content.
- Fixes links that have been done incorrectly and should be cross references instead.
"""
books = info["book_nodes"]
src_dir = info["src_dir"]
dest_dir = info["dest_dir"]
distro = info["distro"]
# Build a mapping of files to ids
# Note: For all-in-one we have to collect ids from all books first
file_to_id_map = {}
if info["all_in_one"]:
book_ids = []
for book in books:
book_ids.extend(collect_existing_ids(book, distro, src_dir))
for book in books:
file_to_id_map.update(build_file_to_id_map(book, distro, book_ids, src_dir))
else:
for book in books:
book_ids = collect_existing_ids(book, distro, src_dir)
file_to_id_map.update(build_file_to_id_map(book, distro, book_ids, src_dir))
info["file_to_id_map"] = file_to_id_map
# Reformat the data
for book in books:
log.info("Processing %s", book["Dir"])
book_src_dir = os.path.join(src_dir, book["Dir"])
if info["all_in_one"]:
images_dir = os.path.join(dest_dir, "images")
else:
book_dest_dir = os.path.join(dest_dir, book["Dir"])
images_dir = os.path.join(book_dest_dir, "images")
ensure_directory(images_dir)
# ADDED 21 Jan 2025: selective processing of images
# the set of file names is to be stored in image_files
# The initial value includes images defined in attributes (to copy every time)
image_files = set()
log.debug("Copying source files for " + book["Name"])
copy_files(book, book_src_dir, src_dir, dest_dir, info, image_files)
log.debug("Copying images for " + book["Name"])
copy_images(book, src_dir, images_dir, distro, image_files)
def copy_images(node, src_path, dest_dir, distro, image_files):
"""
Copy images over to the destination directory and flatten all image directories into the one top level dir.
REWORKED 21 Jan 2025: we now assume that there is a single images directory and
that all other images subdirectories are simply symlinks into it. So we do not
iterate over the tree but simply copy the necessary files from that one images directory
"""
images_source_dir = os.path.join(src_path, "images")
for image_file_name in image_files:
image_file_pathname = os.path.join(images_source_dir,image_file_name)
if os.path.exists(image_file_pathname):
shutil.copy(image_file_pathname, dest_dir)
# if an image file is not found, this is not an error, because it might
# have been picked up from a commented-out line. Actual missing images
# should be caught by the asciidoctor/asciibinder part of CI
def copy_files(node, book_src_dir, src_dir, dest_dir, info, image_files):
"""
Recursively copy files from the source directory to the destination directory, making sure to scrub the content, add id's where the
content is referenced elsewhere and fix any links that should be cross references.
"""
def dir_callback(dir_node, parent_dir, depth):
node_dest_dir = os.path.join(dest_dir, parent_dir, dir_node["Dir"])
ensure_directory(node_dest_dir)
def topic_callback(topic_node, parent_dir, depth):
node_src_dir = os.path.join(src_dir, parent_dir)
node_dest_dir = os.path.join(dest_dir, parent_dir)
src_file = os.path.join(node_src_dir, topic_node["File"] + ".adoc")
dest_file = os.path.join(node_dest_dir, topic_node["File"] + ".adoc")
# Copy the file
copy_file(info, book_src_dir, src_file, dest_dir, dest_file, image_files)
iter_tree(node, info["distro"], dir_callback, topic_callback)
def copy_file(
info,
book_src_dir,
src_file,
dest_dir,
dest_file,
image_files,
include_check=True,
tag=None,
cwd=None,
):
"""
Copies a source file to destination, making sure to scrub the content, add id's where the content is referenced elsewhere and fix any
links that should be cross references. Also copies any includes that are referenced, since they aren't included in _build_cfg.yml.
"""
# It's possible that the file might have been created by another include, if so then just return
if os.path.isfile(dest_file):
return
# Touch the dest file, so we can handle circular includes
parent_dir = os.path.dirname(dest_file)
if not os.path.exists(parent_dir):
os.makedirs(parent_dir)
# os.mknod(dest_file)
open(dest_file, "w").close()
# Scrub/fix the content
content = scrub_file(info, book_src_dir, src_file, image_files, tag=tag, cwd=cwd)
# Check for any includes
if include_check:
cleaned_content = remove_conditional_content(content, info)
include_iter = INCLUDE_RE.finditer(cleaned_content)
for include in include_iter:
include_text = include.group(0)
include_path = include.group(1)
include_unparsed_vars = include.group(2)
# Determine the include vars
include_vars = {}
if include_unparsed_vars is not None and len(include_unparsed_vars) > 0:
for meta in re.split(r"\s*,\s*", include_unparsed_vars):
                    key, value = re.split(r"\s*=\s*", meta, maxsplit=1)
include_vars[key] = value
# Determine the include src/dest paths
include_file = os.path.join(os.path.dirname(book_src_dir), include_path)
relative_path = os.path.relpath(include_file, os.path.dirname(src_file))
# If the path is in another book, copy it into this one
relative_book_path = os.path.relpath(include_file, book_src_dir)
if relative_book_path.startswith("../"):
src_book_relative_dir = os.path.relpath(book_src_dir,info["src_dir"])
dest_include_dir = os.path.join(dest_dir, src_book_relative_dir, "includes")
relative_path = os.path.join(
os.path.relpath(dest_include_dir, parent_dir),
os.path.basename(include_file),
)
else:
dest_include_dir = os.path.abspath(
os.path.join(
os.path.dirname(dest_file), os.path.dirname(relative_path)
)
)
dest_include_file = os.path.join(
dest_include_dir, os.path.basename(include_file)
)
# Make sure we have a reference to the current working dir
current_dir = cwd or os.path.dirname(src_file)
include_tag = include_vars.get("tag", None)
# Copy the file and fix the content
if not os.path.isfile(dest_include_file):
copy_file(
info,
book_src_dir,
include_file,
dest_dir,
dest_include_file,
image_files,
tag=include_tag,
cwd=current_dir,
)
else:
# The file has already been copied, so just fix the links for this tag
with open(dest_include_file, "r") as f:
include_content = f.read()
# Fix any links
include_content = fix_links(
include_content,
info,
book_src_dir,
include_file,
tag=include_tag,
cwd=cwd,
)
with open(dest_include_file, "w") as f:
f.write(include_content)
content = content.replace(
include_text, include.expand("include::" + relative_path + "[\\2]")
)
with open(dest_file, "w") as f:
f.write(content)
def detect_images(content, image_files):
"""
    Detects all image file names referenced in the content (a list of lines, as returned by readlines())
    and adds their base file names to the image_files set.
    Does NOT try to filter out false positives such as commented-out content,
    because "false negatives" are worse.
TEMPORARY: use both procedural and RE detection and report any misalignment
"""
image_pattern = re.compile(r'image::?([^\s\[]+)\[.*?\]')
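    # Illustrative matches: "image::ose_arch.png[Architecture]" and "image:icons/note.png[]"
    # both contribute their base file name to image_files.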
for content_str in content:
image_files.update({os.path.basename(f) for f in image_pattern.findall(content_str)})
def scrub_file(info, book_src_dir, src_file, image_files, tag=None, cwd=None):
"""
Scrubs a file and returns the cleaned file contents.
"""
base_src_file = src_file.replace(info["src_dir"] + "/", "")
# added 1/Sep/2020
    # to allow loading files like json and yaml from external sources, this
    # procedure recognizes source paths that contain an https URL, fetches the
    # URL, and if it exists, returns the raw data that it finds.
# modified 20/Aug/2024 to process https links which are preceded
# by an added directory (happens with hugeBook)
# Modified 05/Dec/2024 to allow for https links from openshift-kni repo.
# Check for both allowed URL patterns
https_pos = base_src_file.find("https://raw.githubusercontent.com/openshift/")
https_kni_pos = base_src_file.find("https://raw.githubusercontent.com/openshift-kni/")
if https_pos >= 0 or https_kni_pos >= 0:
# Ensure we start from the correct URL (either github or openshift-kni)
if https_kni_pos >= 0:
base_src_file = base_src_file[https_kni_pos:]
else:
base_src_file = base_src_file[https_pos:]
try:
response = requests.get(base_src_file)
if response:
return response.text
else:
raise ConnectionError("Malformed URL")
except Exception as exception:
log.error("An include file wasn't found: %s", base_src_file)
list_of_errors.append(f"An include file wasn't found: {base_src_file}")
sys.exit(-1)
# Get a list of predefined custom title ids for the file
title_ids = TITLE_IDS.get(base_src_file, {})
# Read in the source content
with open(src_file, "r") as f:
src_file_content = f.readlines()
# detect image references in the content
detect_images(src_file_content, image_files)
# Scrub the content
content = ""
header_found = content_found = False
current_id = None
for line in src_file_content:
# Ignore any leading blank lines, before any meaningful content is found
if line.strip() == "" and not content_found:
continue
# Replace lines containing commented xrefs
line = COMMENTED_XREF_RE.sub("// Removed commented line that contains an xref", line)
# Check if the line should be included in the output
if include_line(line):
content_found = True
# Setup the document header content/id
if not header_found and line.strip() != "" and line.startswith("="):
header_found = True
if (
info["all_in_one"]
and base_src_file in ALL_IN_ONE_SCRAP_TITLE
and line.startswith("= ")
):
continue
# Add a section id if one doesn't exist, so we have something to link to
elif current_id is None and src_file in info["file_to_id_map"]:
file_id = info["file_to_id_map"][src_file]
content += "[[" + file_id + "]]\n"
# Add a custom title id, if one is needed
elif line.startswith("=") and current_id is None:
for title in title_ids:
title_re = (
r"^=+ "
+ title.replace(".", "\\.").replace("?", "\\?")
+ "( (anchor|\[).*?)?(\n)?$"
)
if re.match(title_re, line):
content += "[[" + title_ids[title] + "]]\n"
# Set the current id based on the line content
if current_id is None and ID_RE.match(line.strip()):
current_id = line.strip()
            elif current_id is not None and line.strip() != "":
current_id = None
# Add the line to the processed content
content += line
# Fix up any duplicate ids
if base_src_file in DUPLICATE_IDS:
for duplicate_id, new_id in list(DUPLICATE_IDS[base_src_file].items()):
content = content.replace("[[" + duplicate_id + "]]", "[[" + new_id + "]]")
# Replace incorrect links with correct ones
if base_src_file in INCORRECT_LINKS:
for incorrect_link, fixed_link in list(INCORRECT_LINKS[base_src_file].items()):
content = content.replace(incorrect_link, fixed_link)
# Fix up the links
content = fix_links(content, info, book_src_dir, src_file, tag=tag, cwd=cwd)
return content
def include_line(line):
"""
Determines if a line should be included in the filtered output.
"""
if line in IGNORE_LINES:
return False
for macro in IGNORE_MACROS:
if line.startswith(":" + macro + ":"):
return False
return True
def fix_links(content, info, book_src_dir, src_file, tag=None, cwd=None):
"""
Fix any links that were done incorrectly and reference the output instead of the source content.
"""
if info["all_in_one"]:
content = _fix_links(content, info["src_dir"], src_file, info)
else:
        # Determine if the tag should be passed when fixing the links. If the file is in the same book, process the
        # entire file; if it is outside the book, only process the content for the given tag.
if book_src_dir in src_file:
content = _fix_links(content, book_src_dir, src_file, info, cwd=cwd)
else:
content = _fix_links(
content, book_src_dir, src_file, info, tag=tag, cwd=cwd
)
return content
def dir_to_book_name(dir,src_file,info):
# find a book name by the directory
    for book in info["book_nodes"]:
        if book["Dir"] == dir:
            return book["Name"]
log.error(
'ERROR (%s): book not found for the directory %s',
src_file,
dir)
list_of_errors.append(f"ERROR ({src_file}): book not found for the directory {dir}")
    return dir
def _fix_links(content, book_dir, src_file, info, tag=None, cwd=None):
"""
Fix any links that were done incorrectly and reference the output instead of the source content.
"""
current_book_name = dir_to_book_name(os.path.relpath(book_dir,info["src_dir"]),src_file,info)
# TODO Deal with xref so that they keep the proper path. Atm it'll just strip the path and leave only the id
file_to_id_map = info["file_to_id_map"]
current_dir = cwd or os.path.dirname(src_file)
cleaned_content = remove_conditional_content(content, info, tag=tag)
links = LINKS_RE.finditer(cleaned_content)
for link in links:
link_text = link.group(0)
link_file = link.group(1)
link_anchor = link.group(2)
link_title = link.group(3)
# sanity check - is this a link to an external site?
# apparently the link macro CAN be used for internal links too, so just testing for http
        # NOTE: a docs.openshift.com link would not process correctly here anyway, so let it pass through
if ("http:" in link_text) or ("https:" in link_text):
continue
fixed_link = "" # setting the scope of fixed_link outside the if statements
if link_file is not None:
fixed_link_file = link_file.replace(".html", ".adoc")
fixed_link_file_abs = os.path.abspath(
os.path.join(current_dir, fixed_link_file)
)
if fixed_link_file_abs in file_to_id_map:
# We are dealing with a cross reference to a book here
full_relative_path = os.path.relpath(fixed_link_file_abs,info["src_dir"])
if full_relative_path[:2]=="..":
log.error(
'ERROR (%s): link pointing outside source directory? %s',
src_file,
link_file)
list_of_errors.append(f'ERROR ({src_file}): link pointing outside source directory? {link_file}')
continue
split_relative_path = full_relative_path.split("/")
book_dir_name = split_relative_path[0]
if book_dir_name in info["huge_book_dirs"]:
book_dir_name = split_relative_path[0]+"/"+split_relative_path[1]
# Find the book name
book_name = dir_to_book_name(book_dir_name,src_file,info)
if book_name==current_book_name:
if link_anchor is None:
fixed_link = "xref:" + file_to_id_map[fixed_link_file_abs] + link_title
else:
fixed_link = "xref:" + link_anchor.replace("#", "") + link_title
else:
fixed_link_file = BASE_PORTAL_URL + build_portal_url(info, book_name)
if link_anchor is None:
fixed_link = (
"link:"
+ fixed_link_file
+ "#"
+ file_to_id_map[fixed_link_file_abs]
+ link_title
)
else:
fixed_link = "link:" + fixed_link_file + link_anchor + link_title
else:
# Cross reference or link that isn't in the docs suite
fixed_link = link_text
if EXTERNAL_LINK_RE.search(link_file) is not None:
rel_src_file = src_file.replace(os.path.dirname(book_dir) + "/", "")
link_text_message = link_text.replace("\n", "")
log.error(
'ERROR (%s): "%s" appears to try to reference a file not included in the "%s" distro',
rel_src_file,
link_text_message,
info["distro"],
)
                    list_of_errors.append(f'ERROR ({rel_src_file}): "{link_text_message}" appears to try to reference a file not included in the "{info["distro"]}" distro')
else:
fixed_link = "xref:" + link_anchor.replace("#", "") + link_title
content = content.replace(link_text, fixed_link)
return content
def remove_conditional_content(content, info, tag=None):
"""
Removes any conditional content that doesn't match for the specified distro
"""
# Remove any ifdef content
ifdef = IFDEF_RE.search(content)
while ifdef is not None:
is_not_def = ifdef.group(1) == "n"
ifdef_distros = ifdef.group(2).split(",")
pos = ifdef.start()
end = ifdef.end()
# Determine if we should strip the conditional content, based on the distro
strip_content = False
if is_not_def and info["distro"] in ifdef_distros:
strip_content = True
elif not is_not_def and info["distro"] not in ifdef_distros:
strip_content = True
# Remove the conditional content
if strip_content:
# Find the correct endif for the current ifdef
search_pos = end
endpos = len(content)
while True:
next_ifdef = IFDEF_RE.search(content, search_pos)
endif = ENDIF_RE.search(content, search_pos)
if not endif:
break
elif not next_ifdef or next_ifdef.start() > endif.start():
endpos = endif.end()
break
else:
search_pos = endif.end()
# Replace the content and move the end pos to be the same as the start since the content was removed
ifdef_text = content[pos:endpos]
content = content.replace(ifdef_text, "")
end = pos
# Move onto the next ifdef
ifdef = IFDEF_RE.search(content, end)
# Remove commented out content
for comment in COMMENT_CONTENT_RE.finditer(content):
content = content.replace(comment.group(0), "")
# Remove content outside of tags
if tag is not None:
for tag_match in TAG_CONTENT_RE.finditer(content):
tag_text = tag_match.group(0)
tag_label = tag_match.group(1)
if tag_label == tag:
# Tag matches, so only use the content in the tag
content = tag_text
return content
def collect_existing_ids(node, distro, path):
"""
    Examines the asciidoc file contents of all nodes and returns any existing ids.
"""
book_ids = []
def topic_callback(topic_node, parent_dir, depth):
src_file = os.path.join(parent_dir, topic_node["File"] + ".adoc")
file_ids = extract_file_ids(src_file)
book_ids.extend(file_ids)
iter_tree(node, distro, topic_callback=topic_callback, parent_dir=path)
return book_ids
def build_file_to_id_map(node, distro, existing_ids, path=""):
"""
Builds a mapping of file names/paths to the root id for the file. This is used to fix the links that are done incorrectly.
"""
file_to_id_map = {}
def topic_callback(topic_node, parent_dir, depth):
src_file = os.path.join(parent_dir, topic_node["File"] + ".adoc")
file_to_id_map[src_file] = build_file_id(
topic_node["Name"], file_to_id_map, existing_ids
)
iter_tree(node, distro, topic_callback=topic_callback, parent_dir=path)
return file_to_id_map
def extract_file_ids(file_path):
"""
Extracts all the ids used in the specified file.
"""
with open(file_path, "r") as f:
content = f.read()
ids = ID_RE.finditer(content)
return [id.group(1) for id in ids]
def build_file_id(file_title, file_to_id_map, existing_ids):
"""
Generates a unique id for a file, based on its title.
"""
file_id = base_id = re.sub(
r"[\[\]\(\)#]", "", file_title.lower().replace("_", "-").replace(" ", "-")
)
count = 1
while file_id in existing_ids or file_id in list(file_to_id_map.values()):
file_id = base_id + "-" + str(count)
count += 1
return file_id
def build_portal_url(info, book_name):
"""
Builds a portal url path by escaping the content in the same way drupal does.
"""
product = info["product"]
version = info["product-version"]
return (
generate_url_from_name(product)
+ "/"
+ generate_url_from_name(version)
+ "/html-single/"
+ generate_url_from_name(book_name)
+ "/"
)
def replace_nbsp(val):
"""Replaces non breaking spaces with a regular space"""
if val is not None:
# Check if the string is unicode
if isinstance(val, str):
return val.replace("\xa0", " ")
else:
return val.replace("\xc2\xa0", " ")
else:
return None
def generate_url_from_name(name, delimiter="_"):
"""
Generates a url fragment from a product, version or titles name.
"""
# Remove characters that aren't allowed in urls
    url = re.sub(r"^\.+|[^0-9a-zA-Z _\-.]+", "", replace_nbsp(name))
    # Replace spaces with the delimiter
    url = re.sub(r"\s+", delimiter, url)
    # Replace multiple consecutive delimiters with a single delimiter
    url = re.sub(delimiter + "+", delimiter, url)
return url.lower()
def call_git_command(*args, **kwargs):
"""
Calls a git command and retries the command if it is unable to connect to the remote repo
"""
retries = kwargs.pop("retries", 3)
    try:
        output = subprocess.check_output(*args, **kwargs)
        if output is not None:
            # check_output returns bytes, so decode before writing to stdout
            if isinstance(output, bytes):
                output = output.decode("utf-8", errors="replace")
            sys.stdout.write(output)
        return output
    except subprocess.CalledProcessError as e:
        retries -= 1
        error_output = e.output.decode("utf-8", errors="replace") if isinstance(e.output, bytes) else (e.output or "")
        if retries > 0 and "fatal: Could not read from remote repository" in error_output:
            # Connection failed, so wait a couple of secs and try again
            time.sleep(2)
            return call_git_command(*args, retries=retries, **kwargs)
        else:
            raise
def fetch_sources(url, branch, dir=None, clone_dirname=None):
"""
    Fetches sources from a git repository. If the repository hasn't been cloned yet, it'll be cloned into `clone_dirname`; if it
    has already been cloned, the repo will just be updated.
"""
# Setup the defaults
if dir is None:
dir = os.getcwd()
if clone_dirname is None:
clone_dirname = url.split("/")[-1].replace(".git", "")
# If the dir already exists update the content, otherwise clone it
clone_dir = os.path.abspath(os.path.join(dir, clone_dirname))
if os.path.exists(os.path.join(clone_dir, ".git")):
cmd = ["git", "pull", "-f"]
cmd_dir = clone_dir
# Do a checkout to make sure we are on the right branch
checkout_cmd = ["git", "checkout", branch]
subprocess.check_output(checkout_cmd, cwd=cmd_dir, stderr=subprocess.STDOUT)
else:
cmd = ["git", "clone", "-b", branch, url, clone_dirname]
cmd_dir = os.path.abspath(dir)
# Execute the command
call_git_command(cmd, cwd=cmd_dir, stderr=subprocess.STDOUT)
def sync_directories(src_dir, dest_dir, ignore=None):
"""
    Syncs two directories so that they both contain the same content, with the exception of ignored files.
"""
if ignore is None:
ignore = []
ignore.extend(CMP_IGNORE_FILES)
dcmp = filecmp.dircmp(src_dir, dest_dir, ignore)
_sync_directories_dircmp(dcmp)
def _sync_directories_dircmp(dcmp):
# Remove files that only exist in the dest directory
for filename in dcmp.right_only:
right = os.path.join(dcmp.right, filename)
if os.path.isfile(right):
os.remove(right)
else:
shutil.rmtree(right)
# Copy files that only exist in the source directory or files that have changed
for filename in dcmp.left_only + dcmp.common_files:
left = os.path.join(dcmp.left, filename)
right = os.path.join(dcmp.right, filename)
if os.path.isfile(left):
shutil.copy2(left, right)
else:
shutil.copytree(left, right)
# Sync sub directories
for subdcmp in list(dcmp.subdirs.values()):
_sync_directories_dircmp(subdcmp)
def commit_and_push_changes(git_dir, git_branch, git_upstream_branch):
"""
Adds, commits and pushes any changes to a local git repository.
"""
# Add all the changes
add_cmd = ["git", "add", "--all"]
subprocess.check_call(add_cmd, cwd=git_dir)
try:
# Commit the changes
commit_cmd = [
"git",
"commit",
"-m",
"Merge branch 'upstream/" + git_upstream_branch + "' into " + git_branch,
"--author",
"CCS OSE Build Script <no-reply@redhat.com>",
]
call_git_command(commit_cmd, cwd=git_dir, stderr=subprocess.STDOUT)
# Push the changes
push_cmd = ["git", "push"]
call_git_command(push_cmd, cwd=git_dir, stderr=subprocess.STDOUT)
    except subprocess.CalledProcessError as e:
        # e.output is bytes when captured by check_output, so decode it before searching
        error_output = e.output.decode("utf-8", errors="replace") if isinstance(e.output, bytes) else (e.output or "")
        if "nothing to commit" not in error_output:
            raise
def parse_repo_config(config_file, distro, version):
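    """
    Reads the repos.ini file and returns a mapping of book directory to GitLab repo URL for the
    "<distro>-<version>" section. Illustrative (assumed) section format:

        [openshift-enterprise-3.0]
        getting_started = https://gitlab.example.com/ccs/getting-started.git
    """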
# Make sure the repo config file exists
if not os.path.isfile(config_file):
log.error("Failed loading the repo configuration from %s", config_file)
sys.exit(-1)
    parser = configparser.ConfigParser()
parser.read(config_file)
repo_urls = dict()
section_name = distro + "-" + version
if parser.has_section(section_name):
for (key, value) in parser.items(section_name):
repo_urls[key] = value
return repo_urls
def main():
parser = setup_parser()
args = parser.parse_args()
logging.basicConfig(format="%(message)s", level=logging.INFO, stream=sys.stdout)
# Copy down the latest files
if not args.no_upstream_fetch:
log.info("Fetching the upstream sources")
fetch_sources(args.upstream_url, args.upstream_branch, clone_dirname=CLONE_DIR)
config = find_build_config_file()
src_dir = os.path.dirname(config)
# Parse the build config
data = parse_build_config(config)
# Filter the list of books that should be built
book_nodes = [node for node in data if check_node_distro_matches(node, args.distro)]
# Make the new source tree
dest_dir = os.path.join(os.getcwd(), "drupal-build", args.distro)
if not args.no_clean:
log.info("Cleaning the drupal-build directory")
if os.path.exists(dest_dir):
shutil.rmtree(dest_dir)
os.makedirs(dest_dir)
elif not os.path.exists(dest_dir):
os.makedirs(dest_dir)
info = {
"title": args.title,
"product-author": args.author,
"product-version": args.version,
"product": args.product,
"distro": args.distro,
"src_dir": src_dir,
"dest_dir": dest_dir,
"data": data,
"book_nodes": book_nodes,
"all_in_one": args.all_in_one,
"preface-title": "",
"upstream_branch": args.upstream_branch,
"huge_book_dirs": []
}
# Build the master files
log.info("Building the drupal files")
build_master_files(info)
# Copy the original data and reformat for drupal
reformat_for_drupal(info)
if list_of_errors:
sys.exit(1)
if args.push:
# Parse the repo urls
config_file = os.path.join(os.path.dirname(__file__), "repos.ini")
repo_urls = parse_repo_config(config_file, args.distro, args.version)
        # Make sure the base git dir exists
base_git_dir = os.path.join(os.getcwd(), "gitlab-repos")
ensure_directory(base_git_dir)
# Checkout the gitlab repo, copy the changes and push them back up
for book_dir, gitlab_repo_url in list(repo_urls.items()):
build_book_dir = os.path.join(dest_dir, book_dir)
git_dirname = gitlab_repo_url.split("/")[-1].replace(".git", "")
git_dir = os.path.join(base_git_dir, git_dirname)
try:
log.info("Fetching " + book_dir + " sources from GitLab")
fetch_sources(gitlab_repo_url, args.branch, base_git_dir, git_dirname)
log.info("Syncing " + book_dir)
sync_directories(build_book_dir, git_dir, ["docinfo.xml"])
log.info("Pushing " + book_dir + " changes back to GitLab")
commit_and_push_changes(git_dir, args.branch, args.upstream_branch)
except subprocess.CalledProcessError as e:
if e.output:
sys.stdout.write(e.output)
raise
if __name__ == "__main__":
main()