openshift-docs/build_for_portal.py
(mirror of https://github.com/openshift/openshift-docs.git)
#!/usr/bin/python
# see notes in the build.py script as to what this file does
import argparse
import configparser
import filecmp
import fnmatch
import os
import re
import shutil
import subprocess
import sys
import tempfile
import time
import requests
import yaml
import logging
# See manual and pip3 install aura.tar.gz for logging
from aura import cli
cli.init_logging(False, True)
list_of_errors = []
CLONE_DIR = "."
BASE_PORTAL_URL = "https://docs.redhat.com/en/documentation/"
# ID_RE = re.compile("^\[(?:\[|id=\'|#)(.*?)(\'?,.*?)?(?:\]|\')?\]", re.M | re.DOTALL)
ID_RE = re.compile(
    r"^\[(?:\[|id='|#|id=\")(.*?)('?,.*?)?(?:\]|'|\")?\]", re.M | re.DOTALL
)
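# Example matches (illustrative): "[[cli-reference]]", "[id='cli-reference']", "[#cli-reference]"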
LINKS_RE = re.compile(
    r"(?:xref|link):([\./\w_-]*/?[\w_.-]*\.(?:html|adoc))?(#[\w_-]*)?(\[.*?\])",
    re.M | re.DOTALL,
)
EXTERNAL_LINK_RE = re.compile(
    r"[\./]*([\w_-]+)/[\w_/-]*?([\w_.-]*\.(?:html|adoc))", re.DOTALL
)
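# Example matches (illustrative):
#   LINKS_RE matches "xref:../dev_guide/builds.adoc#build-triggers[Build triggers]"
#   EXTERNAL_LINK_RE matches "../dev_guide/builds.adoc" with groups ("dev_guide", "builds.adoc")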
INCLUDE_RE = re.compile(r"^include::(.*?)\[(.*?)\]", re.M)
IFDEF_RE = re.compile(r"^if(n?)def::(.*?)\[\]", re.M)
ENDIF_RE = re.compile(r"^endif::(.*?)\[\]\r?\n", re.M)
COMMENT_CONTENT_RE = re.compile(r"^////$.*?^////$", re.M | re.DOTALL)
COMMENTED_XREF_RE = re.compile(r"^//.*xref:.*$")
TAG_CONTENT_RE = re.compile(
r"//\s+tag::(.*?)\[\].*?// end::(.*?)\[\]", re.M | re.DOTALL
)
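# Example matches (illustrative):
#   INCLUDE_RE matches "include::../install_config/prerequisites.adoc[leveloffset=+1,tag=basics]"
#   IFDEF_RE matches "ifdef::openshift-enterprise[]" and "ifndef::openshift-origin[]"; ENDIF_RE matches "endif::[]"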
CMP_IGNORE_FILES = [".git", ".gitignore", "README.md", "build.cfg"]
DEVNULL = open(os.devnull, "wb")
MASTER_FILE_BASE = "= {title}\n\
:product-author: {product-author}\n\
:product-title: {product}\n\
:product-version: {product-version}\n\
:{distro}:\n\
:imagesdir: images\n\
:idseparator: -\n\
{preface-title}\n"
DOCINFO_BASE = '<title>{title}</title>\n\
<productname>{{product-title}}</productname>\n\
<productnumber>{{product-version}}</productnumber>\n\
<subtitle>Enter a short description here.</subtitle>\n\
<abstract>\n\
<para>A short overview and summary of the book\'s subject and purpose, traditionally no more than one paragraph long.</para>\n\
</abstract>\n\
<authorgroup>\n\
<orgname>{product-author}</orgname>\n\
</authorgroup>\n\
<xi:include href="Common_Content/Legal_Notice.xml" xmlns:xi="http://www.w3.org/2001/XInclude" />\n'
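# Both templates above are filled in with str.format() using the per-book info dict built in
# build_master_files() (title, product, product-version, product-author, distro, preface-title).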
# A list of book titles that still use the old drupal url format (i.e. the product/version is included in the book title part)
# e.g. openshift-enterprise/version-3.0/openshift-enterprise-30-getting-started vs openshift-enterprise/version-3.0/getting-started
DRUPAL_OLD_URL_TITLES = [
"Administrator Guide",
"Architecture",
"CLI Reference",
"Creating Images",
"Developer Guide",
"Getting Started",
"REST API Reference",
"Using Images",
"What's New?",
]
# A mapping of upstream book/category names to CP book names
BOOK_NAME_OVERRIDES = {"Administration": "Administrator Guide"}
# Lines that should be stripped out/ignored when cleaning the content
IGNORE_LINES = [
"{product-author}\n",
"{product-version}\n",
"{product-version]\n",
"{Lucas Costi}\n",
"toc::[]\n",
]
# Lines starting with ":MACRO:" are omitted from the output for each MACRO in this list.
IGNORE_MACROS = ["description", "keywords", "icons", "data-uri", "toc", "toc-title"]
# Files where the title should be removed when building the all-in-one
ALL_IN_ONE_SCRAP_TITLE = ["welcome/index.adoc"]
# Files that should be commented out in the toc structure
COMMENT_FILES = [
"admin_guide/overview.adoc",
"creating_images/overview.adoc",
"dev_guide/overview.adoc",
"using_images/overview.adoc",
"rest_api/overview.adoc",
]
# Map FILENAME to a map of TITLE to ID. In most cases the
# ID is the TITLE downcased, with "strange" chars replaced by hyphens.
# A notable exception is 'any' TITLE.
TITLE_IDS = {}
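# Illustrative shape (not actual data):
#   TITLE_IDS = {"dev_guide/builds.adoc": {"Build Triggers": "build-triggers"}}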
# A dictionary of existing dup ids to new unique ids
DUPLICATE_IDS = {}
# Map FILENAME to a map of BAD to GOOD. Most of the time, BAD and GOOD
# are in link syntax, i.e., beginning with "link:", but not always.
INCORRECT_LINKS = {}
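# Illustrative shape (not actual data):
#   INCORRECT_LINKS = {"dev_guide/builds.adoc": {"link:builds.html#triggers[triggers]": "xref:triggers[triggers]"}}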
log = logging.getLogger("build")
def setup_parser():
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
parser.add_argument(
"--distro", help="The distribution to build for", default="openshift-enterprise"
)
parser.add_argument("--all-in-one", help=argparse.SUPPRESS, action="store_true")
parser.add_argument("--title", help=argparse.SUPPRESS, default="Documentation")
parser.add_argument("--product", default="OpenShift Enterprise")
parser.add_argument("--version", default="3.0")
parser.add_argument("--author", default="Red Hat OpenShift Documentation Team")
parser.add_argument(
"--upstream-url",
help="The upstream source url",
default="https://github.com/openshift/openshift-docs.git",
)
parser.add_argument(
"--upstream-branch", help="The upstream source branch", default="enterprise-3.0"
)
parser.add_argument(
"--branch", help="The GitLab branch to commit changes into", default="GA"
)
parser.add_argument(
"-p",
"--push",
help="Commit and push the changes into GitLab",
action="store_true",
)
parser.add_argument(
"--no-clean",
help="Don't clean the drupal-build directory before building",
action="store_true",
)
parser.add_argument(
"--no-upstream-fetch",
help="Don't fetch the upstream sources",
action="store_true",
)
return parser
def find_build_config_file():
"""
    Finds the build config file to use: either the combined topic maps from the _topic_maps folder, or a single _topic_map.yml file.
"""
# updated 23rd Nov to support files in _topic_maps folder
# load everything from the _topic_maps folder
file_list = os.listdir(os.path.join(CLONE_DIR, "_topic_maps"))
# create a temp file combining all values from that folder
# don't delete it immediately, and give it a suffix of swp which makes it ignored by git
with tempfile.NamedTemporaryFile(dir=CLONE_DIR, delete=False, suffix=".swp") as tmp:
for f in file_list:
with open(os.path.join(CLONE_DIR, "_topic_maps", f), "rb") as infile:
tmp.write(infile.read())
config = os.path.abspath(tmp.name)
log.info(config)
# backup look for a single _topic_map in the cloned directory
if not os.path.isfile(config):
config = os.path.abspath(os.path.join(CLONE_DIR, "_topic_map.yml"))
return config
def parse_build_config(config):
"""
Parses the build config and returns a tree based structure for the config.
"""
config = os.path.expanduser(config)
with open(config, "r") as f:
data = list(yaml.load_all(f, Loader=yaml.FullLoader))
for book in data:
book_name = book["Name"]
if book_name in BOOK_NAME_OVERRIDES:
book["Name"] = BOOK_NAME_OVERRIDES[book_name]
return data
def iter_tree(
node,
distro,
dir_callback=None,
topic_callback=None,
include_path=True,
parent_dir="",
depth=0,
):
"""
Iterates over a build config tree starting from a specific node, skipping content where the distro doesn't match.
Additionally, calls are made to the dir_callback or topic_callback functions when a directory or topic is found.
"""
if "Topics" in node:
if check_node_distro_matches(node, distro):
if include_path:
topics_dir = os.path.join(parent_dir, node["Dir"])
else:
topics_dir = ""
if dir_callback is not None:
dir_callback(node, parent_dir, depth)
for topic in node["Topics"]:
iter_tree(
topic,
distro,
dir_callback,
topic_callback,
True,
topics_dir,
depth + 1,
)
elif check_node_distro_matches(node, distro):
if topic_callback is not None:
topic_callback(node, parent_dir, depth)
def check_node_distro_matches(node, distro):
"""
    Checks whether the specified distro matches a distro in the node's Distros list. If no Distros list is
    specified on the node, then all distros are allowed, so return True.
"""
if "Distros" not in node:
return True
else:
node_distros = [x.strip() for x in node["Distros"].split(",")]
for node_distro in node_distros:
# Check for an exact match, or a glob match
if node_distro == distro or fnmatch.fnmatchcase(distro, node_distro):
return True
return False
def ensure_directory(directory):
"""
Creates DIRECTORY if it does not exist.
"""
if not os.path.exists(directory):
os.makedirs(directory)
def expand_huge_books(info):
"""
Finds nodes for huge books, creates new nodes for books from their top-level topics,
and then removes the nodes for huge books
"""
    # find all the huge books, as flagged by a hugeBook.flag file in their source directories
huge_book_nodes = [book for book in info["book_nodes"]
if os.path.exists(os.path.join(info["src_dir"],book["Dir"],"hugeBook.flag")) ]
for book in huge_book_nodes:
# save the directory in info
huge_book_dir = book["Dir"]
info["huge_book_dirs"].append(huge_book_dir)
# create the flag file in the book destination directory
book_dest_dir = os.path.join(info["dest_dir"], book["Dir"])
ensure_directory(book_dest_dir)
with open(os.path.join(book_dest_dir,"hugeBook.flag"),"w") as fi:
fi.write("hugebook")
# make new book nodes for the second-level headings
for topic in book["Topics"]:
if "Dir" in topic.keys():
info["book_nodes"].append(topic)
topic["Dir"] = huge_book_dir + "/" + topic["Dir"]
# remove book nodes for huge books
for node_to_remove in huge_book_nodes:
info["book_nodes"].remove(node_to_remove)
def build_master_files(info):
"""
Builds the master.adoc and docinfo.xml files for each guide specified in the config.
"""
# change the huge books into sub-books
expand_huge_books(info)
# TODO: Refactor. This does too much.
dest_dir = info["dest_dir"]
all_in_one = info["all_in_one"]
all_in_one_text = ""
for book in info["book_nodes"]:
book_dest_dir = os.path.join(dest_dir, book["Dir"])
ensure_directory(book_dest_dir)
book_info = dict(info)
book_info["title"] = book["Name"]
master: str = generate_master_entry(
book, book["Dir"], info["distro"], all_in_one, all_in_one=all_in_one
)
# Save the content
if not all_in_one:
master_file = os.path.join(book_dest_dir, "master.adoc")
docinfo_file = os.path.join(book_dest_dir, "docinfo.xml")
master_base = MASTER_FILE_BASE.format(**book_info)
log.debug("Writing " + master_file)
with open(master_file, "w") as f:
f.write(master_base + master)
log.debug("Writing " + docinfo_file)
with open(docinfo_file, "w") as f:
f.write(DOCINFO_BASE.format(**book_info))
else:
# TODO: Do we ever use this?
if all_in_one_text == "":
# Remove the title for the first file in the book
master = master.replace("= " + book["Name"] + "\n", "")
# Set the preface title from the first file in the book
first_file = os.path.join(
info["src_dir"], book["Dir"], book["Topics"][0]["File"] + ".adoc"
)
preface_title = None
with open(first_file, "r") as f:
line = f.readline()
while line:
if include_line(line):
preface_title = re.sub("^=+ ", "", line)
break
line = f.readline()
if preface_title is not None:
info["preface-title"] = ":preface-title: " + preface_title
all_in_one_text += master
# TODO: And is this ever used?
if all_in_one:
master_file = os.path.join(dest_dir, "master.adoc")
docinfo_file = os.path.join(dest_dir, "docinfo.xml")
master_base = MASTER_FILE_BASE.format(**info)
log.debug("Writing " + master_file)
with open(master_file, "w") as f:
f.write(master_base + all_in_one_text)
log.debug("Writing " + docinfo_file)
with open(docinfo_file, "w") as f:
f.write(DOCINFO_BASE.format(**info))
def generate_master_entry(
node: dict, book_dir: str, distro: str, include_name=True, all_in_one=False
):
"""
Given a node (book dict), generate content for that node's master.adoc file.
"""
master_entries = []
def dir_callback(dir_node, parent_dir, depth):
if include_name or depth > 0:
master_entries.append(
"=" * (depth + 1) + " " + dir_node["Name"].replace("\\", "")
)
def topic_callback(topic_node, parent_dir, depth):
book_file_path = os.path.join(parent_dir, topic_node["File"] + ".adoc")
file_path = os.path.join(book_dir, book_file_path)
include = "include::" + book_file_path + "[leveloffset=+" + str(depth) + "]"
if not all_in_one and file_path in COMMENT_FILES:
master_entries.append("////")
master_entries.append(include)
master_entries.append("////")
else:
master_entries.append(include)
# Add a blank line
master_entries.append("")
# Iterate over the tree and build the master.adoc content
iter_tree(node, distro, dir_callback, topic_callback, include_name)
return "\n".join(master_entries)
def reformat_for_drupal(info):
"""
Reformats the source content for use in the Customer Portal. This function does the following:
- Copies images over and flattens them into a single dir
- Copies source asciidoc over
    - Filters the AsciiDoc source to remove duplicate macro definitions that should only be in the main file.
    - Adds ids for each file, so the files can be properly cross referenced.
    - Adds ids to sections that are cross referenced but have no id.
    - Fixes duplicate ids in the source content.
- Fixes links that have been done incorrectly and should be cross references instead.
"""
books = info["book_nodes"]
src_dir = info["src_dir"]
dest_dir = info["dest_dir"]
distro = info["distro"]
# Build a mapping of files to ids
# Note: For all-in-one we have to collect ids from all books first
file_to_id_map = {}
if info["all_in_one"]:
book_ids = []
for book in books:
book_ids.extend(collect_existing_ids(book, distro, src_dir))
for book in books:
file_to_id_map.update(build_file_to_id_map(book, distro, book_ids, src_dir))
else:
for book in books:
book_ids = collect_existing_ids(book, distro, src_dir)
file_to_id_map.update(build_file_to_id_map(book, distro, book_ids, src_dir))
info["file_to_id_map"] = file_to_id_map
# Reformat the data
for book in books:
log.info("Processing %s", book["Dir"])
book_src_dir = os.path.join(src_dir, book["Dir"])
if info["all_in_one"]:
images_dir = os.path.join(dest_dir, "images")
else:
book_dest_dir = os.path.join(dest_dir, book["Dir"])
images_dir = os.path.join(book_dest_dir, "images")
ensure_directory(images_dir)
# ADDED 21 Jan 2025: selective processing of images
# the set of file names is to be stored in image_files
# The initial value includes images defined in attributes (to copy every time)
image_files = set()
log.debug("Copying source files for " + book["Name"])
copy_files(book, book_src_dir, src_dir, dest_dir, info, image_files)
log.debug("Copying images for " + book["Name"])
copy_images(book, src_dir, images_dir, distro, image_files)
def copy_images(node, src_path, dest_dir, distro, image_files):
"""
Copy images over to the destination directory and flatten all image directories into the one top level dir.
REWORKED 21 Jan 2025: we now assume that there is a single images directory and
that all other images subdirectories are simply symlinks into it. So we do not
iterate over the tree but simply copy the necessary files from that one images directory
"""
images_source_dir = os.path.join(src_path, "images")
for image_file_name in image_files:
image_file_pathname = os.path.join(images_source_dir,image_file_name)
if os.path.exists(image_file_pathname):
shutil.copy(image_file_pathname, dest_dir)
# if an image file is not found, this is not an error, because it might
# have been picked up from a commented-out line. Actual missing images
# should be caught by the asciidoctor/asciibinder part of CI
def copy_files(node, book_src_dir, src_dir, dest_dir, info, image_files):
"""
Recursively copy files from the source directory to the destination directory, making sure to scrub the content, add id's where the
content is referenced elsewhere and fix any links that should be cross references.
"""
def dir_callback(dir_node, parent_dir, depth):
node_dest_dir = os.path.join(dest_dir, parent_dir, dir_node["Dir"])
ensure_directory(node_dest_dir)
def topic_callback(topic_node, parent_dir, depth):
node_src_dir = os.path.join(src_dir, parent_dir)
node_dest_dir = os.path.join(dest_dir, parent_dir)
src_file = os.path.join(node_src_dir, topic_node["File"] + ".adoc")
dest_file = os.path.join(node_dest_dir, topic_node["File"] + ".adoc")
# Copy the file
copy_file(info, book_src_dir, src_file, dest_dir, dest_file, image_files)
iter_tree(node, info["distro"], dir_callback, topic_callback)
def copy_file(
info,
book_src_dir,
src_file,
dest_dir,
dest_file,
image_files,
include_check=True,
tag=None,
cwd=None,
):
"""
Copies a source file to destination, making sure to scrub the content, add id's where the content is referenced elsewhere and fix any
links that should be cross references. Also copies any includes that are referenced, since they aren't included in _build_cfg.yml.
"""
# It's possible that the file might have been created by another include, if so then just return
if os.path.isfile(dest_file):
return
# Touch the dest file, so we can handle circular includes
parent_dir = os.path.dirname(dest_file)
if not os.path.exists(parent_dir):
os.makedirs(parent_dir)
# os.mknod(dest_file)
open(dest_file, "w").close()
# Scrub/fix the content
content = scrub_file(info, book_src_dir, src_file, image_files, tag=tag, cwd=cwd)
# Check for any includes
if include_check:
cleaned_content = remove_conditional_content(content, info)
include_iter = INCLUDE_RE.finditer(cleaned_content)
for include in include_iter:
include_text = include.group(0)
include_path = include.group(1)
include_unparsed_vars = include.group(2)
# Determine the include vars
include_vars = {}
if include_unparsed_vars is not None and len(include_unparsed_vars) > 0:
for meta in re.split(r"\s*,\s*", include_unparsed_vars):
                    key, value = re.split(r"\s*=\s*", meta, maxsplit=1)
include_vars[key] = value
# Determine the include src/dest paths
include_file = os.path.join(os.path.dirname(book_src_dir), include_path)
relative_path = os.path.relpath(include_file, os.path.dirname(src_file))
# If the path is in another book, copy it into this one
relative_book_path = os.path.relpath(include_file, book_src_dir)
if relative_book_path.startswith("../"):
src_book_relative_dir = os.path.relpath(book_src_dir,info["src_dir"])
dest_include_dir = os.path.join(dest_dir, src_book_relative_dir, "includes")
relative_path = os.path.join(
os.path.relpath(dest_include_dir, parent_dir),
os.path.basename(include_file),
)
else:
dest_include_dir = os.path.abspath(
os.path.join(
os.path.dirname(dest_file), os.path.dirname(relative_path)
)
)
dest_include_file = os.path.join(
dest_include_dir, os.path.basename(include_file)
)
# Make sure we have a reference to the current working dir
current_dir = cwd or os.path.dirname(src_file)
include_tag = include_vars.get("tag", None)
# Copy the file and fix the content
if not os.path.isfile(dest_include_file):
copy_file(
info,
book_src_dir,
include_file,
dest_dir,
dest_include_file,
image_files,
tag=include_tag,
cwd=current_dir,
)
else:
# The file has already been copied, so just fix the links for this tag
with open(dest_include_file, "r") as f:
include_content = f.read()
# Fix any links
include_content = fix_links(
include_content,
info,
book_src_dir,
include_file,
tag=include_tag,
cwd=cwd,
)
with open(dest_include_file, "w") as f:
f.write(include_content)
content = content.replace(
include_text, include.expand("include::" + relative_path + "[\\2]")
)
with open(dest_file, "w") as f:
f.write(content)
def detect_images(content, image_files):
"""
    Detects all image file names referenced in the content (a list of lines, as returned by readlines())
    and adds their base file names to the image_files set.
    Does NOT try to filter out false positives such as commented-out content,
    because "false negatives" are worse.
TEMPORARY: use both procedural and RE detection and report any misalignment
"""
image_pattern = re.compile(r'image::?([^\s\[]+)\[.*?\]')
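    # Illustrative matches: "image::ose_arch.png[Architecture]" and "image:icons/note.png[]"
    # both contribute their base file name to image_files.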
for content_str in content:
image_files.update({os.path.basename(f) for f in image_pattern.findall(content_str)})
def scrub_file(info, book_src_dir, src_file, image_files, tag=None, cwd=None):
"""
Scrubs a file and returns the cleaned file contents.
"""
base_src_file = src_file.replace(info["src_dir"] + "/", "")
# added 1/Sep/2020
    # to allow loading files like json and yaml from external sources, this
    # procedure recognizes source paths that contain an https URL, fetches the
    # URL, and if it exists, returns the raw data that it finds.
# modified 20/Aug/2024 to process https links which are preceded
# by an added directory (happens with hugeBook)
# Modified 05/Dec/2024 to allow for https links from openshift-kni repo.
# Check for both allowed URL patterns
https_pos = base_src_file.find("https://raw.githubusercontent.com/openshift/")
https_kni_pos = base_src_file.find("https://raw.githubusercontent.com/openshift-kni/")
if https_pos >= 0 or https_kni_pos >= 0:
# Ensure we start from the correct URL (either github or openshift-kni)
if https_kni_pos >= 0:
base_src_file = base_src_file[https_kni_pos:]
else:
base_src_file = base_src_file[https_pos:]
try:
response = requests.get(base_src_file)
if response:
return response.text
else:
raise ConnectionError("Malformed URL")
except Exception as exception:
log.error("An include file wasn't found: %s", base_src_file)
list_of_errors.append(f"An include file wasn't found: {base_src_file}")
sys.exit(-1)
# Get a list of predefined custom title ids for the file
title_ids = TITLE_IDS.get(base_src_file, {})
# Read in the source content
with open(src_file, "r") as f:
src_file_content = f.readlines()
# detect image references in the content
detect_images(src_file_content, image_files)
# Scrub the content
content = ""
header_found = content_found = False
current_id = None
for line in src_file_content:
# Ignore any leading blank lines, before any meaningful content is found
if line.strip() == "" and not content_found:
continue
# Replace lines containing commented xrefs
line = COMMENTED_XREF_RE.sub("// Removed commented line that contains an xref", line)
# Check if the line should be included in the output
if include_line(line):
content_found = True
# Setup the document header content/id
if not header_found and line.strip() != "" and line.startswith("="):
header_found = True
if (
info["all_in_one"]
and base_src_file in ALL_IN_ONE_SCRAP_TITLE
and line.startswith("= ")
):
continue
# Add a section id if one doesn't exist, so we have something to link to
elif current_id is None and src_file in info["file_to_id_map"]:
file_id = info["file_to_id_map"][src_file]
content += "[[" + file_id + "]]\n"
# Add a custom title id, if one is needed
elif line.startswith("=") and current_id is None:
for title in title_ids:
title_re = (
r"^=+ "
+ title.replace(".", "\\.").replace("?", "\\?")
+ "( (anchor|\[).*?)?(\n)?$"
)
if re.match(title_re, line):
content += "[[" + title_ids[title] + "]]\n"
# Set the current id based on the line content
if current_id is None and ID_RE.match(line.strip()):
current_id = line.strip()
            elif current_id is not None and line.strip() != "":
current_id = None
# Add the line to the processed content
content += line
# Fix up any duplicate ids
if base_src_file in DUPLICATE_IDS:
for duplicate_id, new_id in list(DUPLICATE_IDS[base_src_file].items()):
content = content.replace("[[" + duplicate_id + "]]", "[[" + new_id + "]]")
# Replace incorrect links with correct ones
if base_src_file in INCORRECT_LINKS:
for incorrect_link, fixed_link in list(INCORRECT_LINKS[base_src_file].items()):
content = content.replace(incorrect_link, fixed_link)
# Fix up the links
content = fix_links(content, info, book_src_dir, src_file, tag=tag, cwd=cwd)
return content
def include_line(line):
"""
Determines if a line should be included in the filtered output.
"""
if line in IGNORE_LINES:
return False
for macro in IGNORE_MACROS:
if line.startswith(":" + macro + ":"):
return False
return True
def fix_links(content, info, book_src_dir, src_file, tag=None, cwd=None):
"""
Fix any links that were done incorrectly and reference the output instead of the source content.
"""
if info["all_in_one"]:
content = _fix_links(content, info["src_dir"], src_file, info)
else:
        # Determine if the tag should be passed when fixing the links. If the file is in the same book, process the
        # entire file; if it is outside the book, only process the content for the given tag.
if book_src_dir in src_file:
content = _fix_links(content, book_src_dir, src_file, info, cwd=cwd)
else:
content = _fix_links(
content, book_src_dir, src_file, info, tag=tag, cwd=cwd
)
return content
def dir_to_book_name(dir,src_file,info):
# find a book name by the directory
    for book in info["book_nodes"]:
        if book["Dir"] == dir:
            return book["Name"]
log.error(
'ERROR (%s): book not found for the directory %s',
src_file,
dir)
list_of_errors.append(f"ERROR ({src_file}): book not found for the directory {dir}")
    return dir
def _fix_links(content, book_dir, src_file, info, tag=None, cwd=None):
"""
Fix any links that were done incorrectly and reference the output instead of the source content.
"""
current_book_name = dir_to_book_name(os.path.relpath(book_dir,info["src_dir"]),src_file,info)
# TODO Deal with xref so that they keep the proper path. Atm it'll just strip the path and leave only the id
file_to_id_map = info["file_to_id_map"]
current_dir = cwd or os.path.dirname(src_file)
cleaned_content = remove_conditional_content(content, info, tag=tag)
links = LINKS_RE.finditer(cleaned_content)
for link in links:
link_text = link.group(0)
link_file = link.group(1)
link_anchor = link.group(2)
link_title = link.group(3)
# sanity check - is this a link to an external site?
# apparently the link macro CAN be used for internal links too, so just testing for http
        # NOTE: a docs.openshift.com link would not process correctly here anyway, so let it pass through
if ("http:" in link_text) or ("https:" in link_text):
continue
fixed_link = "" # setting the scope of fixed_link outside the if statements
if link_file is not None:
fixed_link_file = link_file.replace(".html", ".adoc")
fixed_link_file_abs = os.path.abspath(
os.path.join(current_dir, fixed_link_file)
)
if fixed_link_file_abs in file_to_id_map:
# We are dealing with a cross reference to a book here
full_relative_path = os.path.relpath(fixed_link_file_abs,info["src_dir"])
if full_relative_path[:2]=="..":
log.error(
'ERROR (%s): link pointing outside source directory? %s',
src_file,
link_file)
list_of_errors.append(f'ERROR ({src_file}): link pointing outside source directory? {link_file}')
continue
split_relative_path = full_relative_path.split("/")
book_dir_name = split_relative_path[0]
if book_dir_name in info["huge_book_dirs"]:
book_dir_name = split_relative_path[0]+"/"+split_relative_path[1]
# Find the book name
book_name = dir_to_book_name(book_dir_name,src_file,info)
if book_name==current_book_name:
if link_anchor is None:
fixed_link = "xref:" + file_to_id_map[fixed_link_file_abs] + link_title
else:
fixed_link = "xref:" + link_anchor.replace("#", "") + link_title
else:
fixed_link_file = BASE_PORTAL_URL + build_portal_url(info, book_name)
if link_anchor is None:
fixed_link = (
"link:"
+ fixed_link_file
+ "#"
+ file_to_id_map[fixed_link_file_abs]
+ link_title
)
else:
fixed_link = "link:" + fixed_link_file + link_anchor + link_title
else:
# Cross reference or link that isn't in the docs suite
fixed_link = link_text
if EXTERNAL_LINK_RE.search(link_file) is not None:
rel_src_file = src_file.replace(os.path.dirname(book_dir) + "/", "")
link_text_message = link_text.replace("\n", "")
log.error(
'ERROR (%s): "%s" appears to try to reference a file not included in the "%s" distro',
rel_src_file,
link_text_message,
info["distro"],
)
                    list_of_errors.append(f'ERROR ({rel_src_file}): "{link_text_message}" appears to try to reference a file not included in the "{info["distro"]}" distro')
else:
fixed_link = "xref:" + link_anchor.replace("#", "") + link_title
content = content.replace(link_text, fixed_link)
return content
def remove_conditional_content(content, info, tag=None):
"""
Removes any conditional content that doesn't match for the specified distro
"""
# Remove any ifdef content
ifdef = IFDEF_RE.search(content)
while ifdef is not None:
is_not_def = ifdef.group(1) == "n"
ifdef_distros = ifdef.group(2).split(",")
pos = ifdef.start()
end = ifdef.end()
# Determine if we should strip the conditional content, based on the distro
strip_content = False
if is_not_def and info["distro"] in ifdef_distros:
strip_content = True
elif not is_not_def and info["distro"] not in ifdef_distros:
strip_content = True
# Remove the conditional content
if strip_content:
# Find the correct endif for the current ifdef
search_pos = end
endpos = len(content)
while True:
next_ifdef = IFDEF_RE.search(content, search_pos)
endif = ENDIF_RE.search(content, search_pos)
if not endif:
break
elif not next_ifdef or next_ifdef.start() > endif.start():
endpos = endif.end()
break
else:
search_pos = endif.end()
# Replace the content and move the end pos to be the same as the start since the content was removed
ifdef_text = content[pos:endpos]
content = content.replace(ifdef_text, "")
end = pos
# Move onto the next ifdef
ifdef = IFDEF_RE.search(content, end)
# Remove commented out content
for comment in COMMENT_CONTENT_RE.finditer(content):
content = content.replace(comment.group(0), "")
# Remove content outside of tags
if tag is not None:
for tag_match in TAG_CONTENT_RE.finditer(content):
tag_text = tag_match.group(0)
tag_label = tag_match.group(1)
if tag_label == tag:
# Tag matches, so only use the content in the tag
content = tag_text
return content
def collect_existing_ids(node, distro, path):
"""
    Examines the asciidoc file contents of all nodes and returns any existing ids.
"""
book_ids = []
def topic_callback(topic_node, parent_dir, depth):
src_file = os.path.join(parent_dir, topic_node["File"] + ".adoc")
file_ids = extract_file_ids(src_file)
book_ids.extend(file_ids)
iter_tree(node, distro, topic_callback=topic_callback, parent_dir=path)
return book_ids
def build_file_to_id_map(node, distro, existing_ids, path=""):
"""
Builds a mapping of file names/paths to the root id for the file. This is used to fix the links that are done incorrectly.
"""
file_to_id_map = {}
def topic_callback(topic_node, parent_dir, depth):
src_file = os.path.join(parent_dir, topic_node["File"] + ".adoc")
file_to_id_map[src_file] = build_file_id(
topic_node["Name"], file_to_id_map, existing_ids
)
iter_tree(node, distro, topic_callback=topic_callback, parent_dir=path)
return file_to_id_map
def extract_file_ids(file_path):
"""
Extracts all the ids used in the specified file.
"""
with open(file_path, "r") as f:
content = f.read()
ids = ID_RE.finditer(content)
return [id.group(1) for id in ids]
def build_file_id(file_title, file_to_id_map, existing_ids):
"""
Generates a unique id for a file, based on its title.
"""
file_id = base_id = re.sub(
r"[\[\]\(\)#]", "", file_title.lower().replace("_", "-").replace(" ", "-")
)
count = 1
while file_id in existing_ids or file_id in list(file_to_id_map.values()):
file_id = base_id + "-" + str(count)
count += 1
return file_id
def build_portal_url(info, book_name):
"""
Builds a portal url path by escaping the content in the same way drupal does.
"""
product = info["product"]
version = info["product-version"]
return (
generate_url_from_name(product)
+ "/"
+ generate_url_from_name(version)
+ "/html-single/"
+ generate_url_from_name(book_name)
+ "/"
)
def replace_nbsp(val):
"""Replaces non breaking spaces with a regular space"""
if val is not None:
# Check if the string is unicode
if isinstance(val, str):
return val.replace("\xa0", " ")
else:
return val.replace("\xc2\xa0", " ")
else:
return None
def generate_url_from_name(name, delimiter="_"):
"""
Generates a url fragment from a product, version or titles name.
"""
# Remove characters that aren't allowed in urls
    url = re.sub(r"^\.+|[^0-9a-zA-Z _\-.]+", "", replace_nbsp(name))
    # Replace spaces with the delimiter
    url = re.sub(r"\s+", delimiter, url)
    # Replace multiple consecutive delimiters with a single delimiter
    url = re.sub(delimiter + "+", delimiter, url)
return url.lower()
def call_git_command(*args, **kwargs):
"""
Calls a git command and retries the command if it is unable to connect to the remote repo
"""
retries = kwargs.pop("retries", 3)
    try:
        output = subprocess.check_output(*args, **kwargs)
        if output is not None:
            # check_output returns bytes, so decode before writing to stdout
            if isinstance(output, bytes):
                output = output.decode("utf-8", errors="replace")
            sys.stdout.write(output)
        return output
    except subprocess.CalledProcessError as e:
        retries -= 1
        error_output = e.output.decode("utf-8", errors="replace") if isinstance(e.output, bytes) else (e.output or "")
        if retries > 0 and "fatal: Could not read from remote repository" in error_output:
            # Connection failed, so wait a couple of secs and try again
            time.sleep(2)
            return call_git_command(*args, retries=retries, **kwargs)
        else:
            raise
def fetch_sources(url, branch, dir=None, clone_dirname=None):
"""
    Fetches sources from a git repository. If the repository hasn't been cloned yet, it'll be cloned into `clone_dirname`; if it
    has already been cloned, the repo will just be updated.
"""
# Setup the defaults
if dir is None:
dir = os.getcwd()
if clone_dirname is None:
clone_dirname = url.split("/")[-1].replace(".git", "")
# If the dir already exists update the content, otherwise clone it
clone_dir = os.path.abspath(os.path.join(dir, clone_dirname))
if os.path.exists(os.path.join(clone_dir, ".git")):
cmd = ["git", "pull", "-f"]
cmd_dir = clone_dir
# Do a checkout to make sure we are on the right branch
checkout_cmd = ["git", "checkout", branch]
subprocess.check_output(checkout_cmd, cwd=cmd_dir, stderr=subprocess.STDOUT)
else:
cmd = ["git", "clone", "-b", branch, url, clone_dirname]
cmd_dir = os.path.abspath(dir)
# Execute the command
call_git_command(cmd, cwd=cmd_dir, stderr=subprocess.STDOUT)
def sync_directories(src_dir, dest_dir, ignore=None):
"""
    Syncs two directories so that they both contain the same content, with the exception of ignored files.
"""
if ignore is None:
ignore = []
ignore.extend(CMP_IGNORE_FILES)
dcmp = filecmp.dircmp(src_dir, dest_dir, ignore)
_sync_directories_dircmp(dcmp)
def _sync_directories_dircmp(dcmp):
# Remove files that only exist in the dest directory
for filename in dcmp.right_only:
right = os.path.join(dcmp.right, filename)
if os.path.isfile(right):
os.remove(right)
else:
shutil.rmtree(right)
# Copy files that only exist in the source directory or files that have changed
for filename in dcmp.left_only + dcmp.common_files:
left = os.path.join(dcmp.left, filename)
right = os.path.join(dcmp.right, filename)
if os.path.isfile(left):
shutil.copy2(left, right)
else:
shutil.copytree(left, right)
# Sync sub directories
for subdcmp in list(dcmp.subdirs.values()):
_sync_directories_dircmp(subdcmp)
def commit_and_push_changes(git_dir, git_branch, git_upstream_branch):
"""
Adds, commits and pushes any changes to a local git repository.
"""
# Add all the changes
add_cmd = ["git", "add", "--all"]
subprocess.check_call(add_cmd, cwd=git_dir)
try:
# Commit the changes
commit_cmd = [
"git",
"commit",
"-m",
"Merge branch 'upstream/" + git_upstream_branch + "' into " + git_branch,
"--author",
"CCS OSE Build Script <no-reply@redhat.com>",
]
call_git_command(commit_cmd, cwd=git_dir, stderr=subprocess.STDOUT)
# Push the changes
push_cmd = ["git", "push"]
call_git_command(push_cmd, cwd=git_dir, stderr=subprocess.STDOUT)
    except subprocess.CalledProcessError as e:
        # e.output is bytes when captured by check_output, so decode it before searching
        error_output = e.output.decode("utf-8", errors="replace") if isinstance(e.output, bytes) else (e.output or "")
        if "nothing to commit" not in error_output:
            raise
def parse_repo_config(config_file, distro, version):
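    """
    Reads the repos.ini file and returns a mapping of book directory to GitLab repo URL for the
    "<distro>-<version>" section. Illustrative (assumed) section format:

        [openshift-enterprise-3.0]
        getting_started = https://gitlab.example.com/ccs/getting-started.git
    """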
# Make sure the repo config file exists
if not os.path.isfile(config_file):
log.error("Failed loading the repo configuration from %s", config_file)
sys.exit(-1)
    parser = configparser.ConfigParser()
parser.read(config_file)
repo_urls = dict()
section_name = distro + "-" + version
if parser.has_section(section_name):
for (key, value) in parser.items(section_name):
repo_urls[key] = value
return repo_urls
def main():
parser = setup_parser()
args = parser.parse_args()
logging.basicConfig(format="%(message)s", level=logging.INFO, stream=sys.stdout)
# Copy down the latest files
if not args.no_upstream_fetch:
log.info("Fetching the upstream sources")
fetch_sources(args.upstream_url, args.upstream_branch, clone_dirname=CLONE_DIR)
config = find_build_config_file()
src_dir = os.path.dirname(config)
# Parse the build config
data = parse_build_config(config)
# Filter the list of books that should be built
book_nodes = [node for node in data if check_node_distro_matches(node, args.distro)]
# Make the new source tree
dest_dir = os.path.join(os.getcwd(), "drupal-build", args.distro)
if not args.no_clean:
log.info("Cleaning the drupal-build directory")
if os.path.exists(dest_dir):
shutil.rmtree(dest_dir)
os.makedirs(dest_dir)
elif not os.path.exists(dest_dir):
os.makedirs(dest_dir)
info = {
"title": args.title,
"product-author": args.author,
"product-version": args.version,
"product": args.product,
"distro": args.distro,
"src_dir": src_dir,
"dest_dir": dest_dir,
"data": data,
"book_nodes": book_nodes,
"all_in_one": args.all_in_one,
"preface-title": "",
"upstream_branch": args.upstream_branch,
"huge_book_dirs": []
}
# Build the master files
log.info("Building the drupal files")
build_master_files(info)
# Copy the original data and reformat for drupal
reformat_for_drupal(info)
if list_of_errors:
sys.exit(1)
if args.push:
# Parse the repo urls
config_file = os.path.join(os.path.dirname(__file__), "repos.ini")
repo_urls = parse_repo_config(config_file, args.distro, args.version)
        # Make sure the base git dir exists
base_git_dir = os.path.join(os.getcwd(), "gitlab-repos")
ensure_directory(base_git_dir)
# Checkout the gitlab repo, copy the changes and push them back up
for book_dir, gitlab_repo_url in list(repo_urls.items()):
build_book_dir = os.path.join(dest_dir, book_dir)
git_dirname = gitlab_repo_url.split("/")[-1].replace(".git", "")
git_dir = os.path.join(base_git_dir, git_dirname)
try:
log.info("Fetching " + book_dir + " sources from GitLab")
fetch_sources(gitlab_repo_url, args.branch, base_git_dir, git_dirname)
log.info("Syncing " + book_dir)
sync_directories(build_book_dir, git_dir, ["docinfo.xml"])
log.info("Pushing " + book_dir + " changes back to GitLab")
commit_and_push_changes(git_dir, args.branch, args.upstream_branch)
except subprocess.CalledProcessError as e:
if e.output:
sys.stdout.write(e.output)
raise
if __name__ == "__main__":
main()