openshift-docs/build.py

#!/usr/bin/python

import argparse
import ConfigParser
import filecmp
import fnmatch
import logging
import os
import re
import shutil
import subprocess
import sys
import time
import yaml

from aura import cli

cli.init_logging(False, True)

# Module-level flag meant to record that an error was logged during the build.
has_errors = False
# Directory that is searched for the build config (_topic_map.yml / _build_cfg.yml).
CLONE_DIR = "."
# Prefix used when a cross reference has to be turned into a link to another book on the portal.
BASE_PORTAL_URL = "https://access.redhat.com/documentation/en-us/"
# Previous version of ID_RE, kept for reference (it lacked the id="..." double quoted form):
# ID_RE = re.compile("^\[(?:\[|id=\'|#)(.*?)(\'?,.*?)?(?:\]|\')?\]", re.M | re.DOTALL)
# Matches an id line such as [[some-id]], [id='some-id'], [id="some-id"] or [#some-id]. Group 1 is the id.
ID_RE = re.compile("^\[(?:\[|id=\'|#|id=\")(.*?)(\'?,.*?)?(?:\]|\'|\")?\]", re.M | re.DOTALL)
# Matches xref:/link: macros. Group 1 is the optional .html/.adoc file, group 2 the optional "#anchor" and
# group 3 the bracketed link text (brackets included).
LINKS_RE = re.compile("(?:xref|link):([\./\w_-]*/?[\w_.-]*\.(?:html|adoc))?(#[\w_-]*)?(\[.*?\])", re.M | re.DOTALL)
# Splits a relative link path. Group 1 is the first directory component after any leading "./" or "../"
# (used as the book directory name), group 2 is the .html/.adoc file name.
EXTERNAL_LINK_RE = re.compile("[\./]*([\w_-]+)/[\w_/-]*?([\w_.-]*\.(?:html|adoc))", re.DOTALL)
# Matches include::path[attributes]. Group 1 is the path, group 2 the raw attribute list.
INCLUDE_RE = re.compile("include::(.*?)\[(.*?)\]", re.M)
# Matches an ifdef::/ifndef:: line with an empty [] body. Group 1 is "n" for ifndef, group 2 the attribute list.
IFDEF_RE = re.compile(r"^if(n?)def::(.*?)\[\]", re.M)
# Matches an endif::...[] line, including its line ending.
ENDIF_RE = re.compile(r"^endif::(.*?)\[\]\r?\n", re.M)
# Matches a block comment, i.e. everything between two lines consisting only of "////".
COMMENT_CONTENT_RE = re.compile(r"^^////$.*?^////$", re.M | re.DOTALL)
# Matches a tagged region from "// tag::name[]" to "// end::name[]". Group 1 is the opening tag name.
TAG_CONTENT_RE = re.compile(r"//\s+tag::(.*?)\[\].*?// end::(.*?)\[\]", re.M | re.DOTALL)
# File names that are always ignored when two directories are compared/synced.
CMP_IGNORE_FILES = [".git", ".gitignore", "README.md", "build.cfg"]
# Write handle on the null device, opened at import time.
DEVNULL = open(os.devnull, 'wb')


MASTER_FILE_BASE = "= {title}\n\
:product-author: {product-author}\n\
:product-title: {product}\n\
:product-version: {product-version}\n\
:{distro}:\n\
:imagesdir: images\n\
:idseparator: -\n\
{preface-title}\n"

DOCINFO_BASE = "<title>{title}</title>\n\
<productname>{{product-title}}</productname>\n\
<productnumber>{{product-version}}</productnumber>\n\
<subtitle>Enter a short description here.</subtitle>\n\
<abstract>\n\
    <para>A short overview and summary of the book's subject and purpose, traditionally no more than one paragraph long.</para>\n\
</abstract>\n\
<authorgroup>\n\
    <orgname>{product-author}</orgname>\n\
</authorgroup>\n\
<xi:include href=\"Common_Content/Legal_Notice.xml\" xmlns:xi=\"http://www.w3.org/2001/XInclude\" />\n"

# A list of book titles that still use the old Drupal URL format (i.e. the product/version is included in the book title part),
# e.g. openshift-enterprise/version-3.0/openshift-enterprise-30-getting-started vs openshift-enterprise/version-3.0/getting-started
DRUPAL_OLD_URL_TITLES = [
    "Administrator Guide",
    "Architecture",
    "CLI Reference",
    "Creating Images",
    "Developer Guide",
    "Getting Started",
    "REST API Reference",
    "Using Images",
    "What's New?"
]

# A mapping of upstream book/category names to CP book names
BOOK_NAME_OVERRIDES = {
    "Administration": "Administrator Guide"
}

# Lines that should be stripped out/ignored when cleaning the content
IGNORE_LINES = [
    "{product-author}\n",
    "{product-version}\n",
    "{product-version]\n",
    "{Lucas Costi}\n",
    "toc::[]\n"
]

# Each MACRO in this list is omitted from the output
# if the input appears as ':MACRO:' (colon, MACRO, colon).
IGNORE_MACROS = [
    "description",
    "keywords",
    "icons",
    "data-uri",
    "toc",
    "toc-title"
]

# Files where the title should be removed when building the all-in-one
ALL_IN_ONE_SCRAP_TITLE = [
    "welcome/index.adoc"
]

# Files that should be commented out in the toc structure
COMMENT_FILES = [
    "admin_guide/overview.adoc",
    "creating_images/overview.adoc",
    "dev_guide/overview.adoc",
    "using_images/overview.adoc",
    "rest_api/overview.adoc"
]

# Map FILENAME to a map of TITLE to ID.  In most of the cases the
# ID is the TITLE downcased, with "strange" chars replaced by hyphen.
# A notable exception is 'any' TITLE.
TITLE_IDS = {}
# A dictionary of existing dup ids to new unique ids
DUPLICATE_IDS = {}
# Map FILENAME to a map of BAD to GOOD.  Most of the time, BAD and GOOD
# are in link syntax, i.e., beginning with "link:", but not always.
INCORRECT_LINKS = {}

log = logging.getLogger("build")


def setup_parser():
    """
    Creates the command line parser for the build script.

    Defaults are shown in the generated help text. The --all-in-one and --title options are accepted but hidden from the help output.
    """
    hidden = argparse.SUPPRESS
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    add = parser.add_argument

    # What gets built
    add("--distro", default="openshift-enterprise", help="The distribution to build for")
    add("--all-in-one", action="store_true", help=hidden)
    add("--title", default="Documentation", help=hidden)
    add("--product", default="OpenShift Enterprise")
    add("--version", default="3.0")
    add("--author", default="Red Hat OpenShift Documentation Team")

    # Where the sources come from and where the result goes
    add("--upstream-url", default="https://github.com/openshift/openshift-docs.git", help="The upstream source url")
    add("--upstream-branch", default="enterprise-3.0", help="The upstream source branch")
    add("--branch", default="GA", help="The GitLab branch to commit changes into")
    add("-p", "--push", action="store_true", help="Commit and push the changes into GitLab")

    # Switches that skip parts of the build
    add("--no-clean", action="store_true", help="Don't clean the drupal-build directory before building")
    add("--no-upstream-fetch", action="store_true", help="Don't fetch the upstream sources")
    return parser


def find_build_config_file():
    """
    Returns the absolute path of the build config file inside CLONE_DIR.

    _topic_map.yml is preferred when it exists as a file; otherwise the path of _build_cfg.yml is returned (without checking that it
    exists).
    """
    topic_map = os.path.abspath(os.path.join(CLONE_DIR, "_topic_map.yml"))
    if os.path.isfile(topic_map):
        return topic_map

    return os.path.abspath(os.path.join(CLONE_DIR, "_build_cfg.yml"))


def parse_build_config(config):
    """
    Parses the build config and returns a tree based structure for the config.

    The config is a multi-document YAML file; every document describes one book. Book names listed in BOOK_NAME_OVERRIDES are
    replaced by their Customer Portal names.

    :param config: Path to the YAML build config ("~" is expanded).
    :return: A list with one parsed document (book node) per YAML document.
    """
    config = os.path.expanduser(config)
    with open(config, "r") as f:
        # safe_load_all only builds plain Python objects, which is all the build config needs. The bare yaml.load_all() call
        # can construct arbitrary objects and is rejected by PyYAML >= 6 unless a Loader is passed explicitly.
        data = list(yaml.safe_load_all(f))

    for book in data:
        book_name = book['Name']
        if book_name in BOOK_NAME_OVERRIDES:
            book['Name'] = BOOK_NAME_OVERRIDES[book_name]

    return data


def iter_tree(node, distro, dir_callback=None, topic_callback=None, include_path=True, parent_dir="", depth=0):
    """
    Walks a build config tree from `node` downwards and reports what it finds through callbacks.

    A node that has a "Topics" entry is treated as a directory, anything else as a topic. Nodes whose distros don't match `distro` are
    skipped together with everything below them. Both callbacks are invoked as callback(node, parent_dir, depth).

    When include_path is False the directory of the starting node is left out of the parent_dir handed to its children; nested
    directories always contribute their "Dir" to the path.
    """
    is_directory = "Topics" in node
    if not check_node_distro_matches(node, distro):
        return

    if not is_directory:
        if topic_callback is not None:
            topic_callback(node, parent_dir, depth)
        return

    children_dir = os.path.join(parent_dir, node["Dir"]) if include_path else ""

    if dir_callback is not None:
        dir_callback(node, parent_dir, depth)

    for child in node["Topics"]:
        iter_tree(child, distro, dir_callback, topic_callback, True, children_dir, depth + 1)


def check_node_distro_matches(node, distro):
    """
    Tells whether `node` applies to `distro`.

    A node without a "Distros" entry applies to every distro. Otherwise "Distros" is a comma separated list whose entries are compared
    with `distro` either literally or as case-sensitive glob patterns.
    """
    if "Distros" not in node:
        return True

    candidates = (entry.strip() for entry in node['Distros'].split(","))
    return any(candidate == distro or fnmatch.fnmatchcase(distro, candidate) for candidate in candidates)


def ensure_directory(directory):
    """
    Creates DIRECTORY if it does not exist.

    Missing parent directories are created as well. If something already exists at the path nothing is done.

    :param directory: Path of the directory to create.
    :raises OSError: If the directory could not be created.
    """
    if os.path.exists(directory):
        return

    try:
        os.makedirs(directory)
    except OSError:
        # Another process may have created the directory between the check and the call; only fail if it is really missing
        if not os.path.isdir(directory):
            raise


def build_master_files(info):
    """
    Builds the master.adoc and docinfo.xml files for each guide specified in the config.

    In the normal mode every book gets its own master.adoc/docinfo.xml inside its destination directory. In all-in-one mode the
    entries of all books are concatenated and written once into the top level destination directory.

    Reads 'dest_dir', 'all_in_one', 'book_nodes', 'distro' and (all-in-one only) 'src_dir' from `info`, plus whatever keys the
    MASTER_FILE_BASE/DOCINFO_BASE templates reference. In all-in-one mode info['preface-title'] may be overwritten.
    """
    dest_dir = info['dest_dir']

    all_in_one = info['all_in_one']
    all_in_one_text = ""
    for book in info['book_nodes']:
        book_dest_dir = os.path.join(dest_dir, book['Dir'])
        ensure_directory(book_dest_dir)

        # Per book copy of the template values, with the title replaced by the book name
        book_info = dict(info)
        book_info['title'] = book['Name']

        # The book name is only emitted as a heading in all-in-one mode (all_in_one doubles as include_name)
        master = generate_master_entry(book, book['Dir'], info['distro'], all_in_one, all_in_one=all_in_one)

        # Save the content
        if not all_in_one:
            master_file = os.path.join(book_dest_dir, 'master.adoc')
            docinfo_file = os.path.join(book_dest_dir, 'docinfo.xml')
            master_base = MASTER_FILE_BASE.format(**book_info)

            log.debug("Writing " + master_file)
            with open(master_file, "w") as f:
                f.write(master_base + master)
            log.debug("Writing " + docinfo_file)
            with open(docinfo_file, "w") as f:
                f.write(DOCINFO_BASE.format(**book_info))
        else:
            # An empty accumulator means this is the first book (or nothing has been generated so far)
            if all_in_one_text == "":
                # Remove the title for the first file in the book
                master = master.replace("= " + book['Name'] + "\n", "")

                # Set the preface title from the first file in the book
                # NOTE(review): assumes the first entry of book['Topics'] is a topic with a 'File' key, not a sub directory
                first_file = os.path.join(info['src_dir'], book['Dir'], book['Topics'][0]['File'] + ".adoc")
                preface_title = None
                with open(first_file, "r") as f:
                    line = f.readline()
                    while line:
                        # The first line that isn't filtered out is used as the title, with any leading "=" markers removed
                        if include_line(line):
                            preface_title = re.sub("^=+ ", "", line)
                            break
                        line = f.readline()
                if preface_title is not None:
                    # preface_title still carries the line ending of the line it was read from
                    info['preface-title'] = ":preface-title: " + preface_title
            all_in_one_text += master

    if all_in_one:
        master_file = os.path.join(dest_dir, 'master.adoc')
        docinfo_file = os.path.join(dest_dir, 'docinfo.xml')

        # Formatted after the loop, so that the preface title set above is picked up
        master_base = MASTER_FILE_BASE.format(**info)

        log.debug("Writing " + master_file)
        with open(master_file, "w") as f:
            f.write(master_base + all_in_one_text)
        log.debug("Writing " + docinfo_file)
        with open(docinfo_file, "w") as f:
            f.write(DOCINFO_BASE.format(**info))


def generate_master_entry(node, book_dir, distro, include_name=True, all_in_one=False):
    """
    Produces the body of a master.adoc file for one book/node.

    Directories become section headings ("=" repeated depth + 1 times) and topics become include directives with a matching
    leveloffset. The heading of the top level node is only written when include_name is set. Unless building all-in-one, topics listed
    in COMMENT_FILES are wrapped in a "////" comment block.
    """
    lines = []

    def dir_callback(dir_node, parent_dir, depth):
        if not include_name and depth <= 0:
            return
        heading_marker = "=" * (depth + 1)
        lines.append(heading_marker + " " + dir_node["Name"].replace("\\", ""))

    def topic_callback(topic_node, parent_dir, depth):
        relative_file = os.path.join(parent_dir, topic_node["File"] + ".adoc")
        include_directive = "include::" + relative_file + "[leveloffset=+" + str(depth) + "]"

        commented_out = not all_in_one and os.path.join(book_dir, relative_file) in COMMENT_FILES
        if commented_out:
            lines.extend(["////", include_directive, "////"])
        else:
            lines.append(include_directive)

        # Every entry is followed by a blank line
        lines.append("")

    # Walk the tree, letting the callbacks collect the lines
    iter_tree(node, distro, dir_callback, topic_callback, include_name)
    return "\n".join(lines)


def reformat_for_drupal(info):
    """
    Converts the source content into the layout used by the Customer Portal.

    Steps performed:

    - A file -> id mapping is built for all books and stored in info['file_to_id_map'].
    - The AsciiDoc sources of each book are copied and cleaned up (see copy_files).
    - The images of each book are copied into a single flat "images" directory (see copy_images).
    """
    books = info['book_nodes']
    src_dir = info['src_dir']
    dest_dir = info['dest_dir']
    distro = info['distro']
    all_in_one = info['all_in_one']

    # Work out the id of every file. For all-in-one the ids have to be unique across all books, so the existing ids of every book
    # are gathered before any new id is generated. Otherwise uniqueness is only needed within each book.
    file_to_id_map = {}
    if all_in_one:
        known_ids = []
        for book in books:
            known_ids.extend(collect_existing_ids(book, distro, src_dir))
        for book in books:
            file_to_id_map.update(build_file_to_id_map(book, distro, known_ids, src_dir))
    else:
        for book in books:
            known_ids = collect_existing_ids(book, distro, src_dir)
            file_to_id_map.update(build_file_to_id_map(book, distro, known_ids, src_dir))
    info['file_to_id_map'] = file_to_id_map

    # Copy and clean up each book
    for book in books:
        log.info("Processing %s", book['Dir'])
        book_src_dir = os.path.join(src_dir, book['Dir'])

        # All-in-one shares one images directory, otherwise each book has its own
        images_parent = dest_dir if all_in_one else os.path.join(dest_dir, book['Dir'])
        images_dir = os.path.join(images_parent, "images")
        ensure_directory(images_dir)

        log.debug("Copying source files for " + book['Name'])
        copy_files(book, book_src_dir, src_dir, dest_dir, info)

        log.debug("Copying images for " + book['Name'])
        copy_images(book, src_dir, images_dir, distro)


def copy_images(node, src_path, dest_dir, distro):
    """
    Copies the content of every "images" directory found below the node's directories straight into dest_dir, so that all images end
    up in one flat directory.
    """
    def dir_callback(dir_node, parent_dir, depth):
        images_src = os.path.join(parent_dir, dir_node['Dir'], "images")
        if not os.path.exists(images_src):
            return

        for image_name in os.listdir(images_src):
            shutil.copy(os.path.join(images_src, image_name), dest_dir)

    iter_tree(node, distro, dir_callback, parent_dir=src_path)


def copy_files(node, book_src_dir, src_dir, dest_dir, info):
    """
    Walks the node's tree and copies every topic file from src_dir to the same relative location in dest_dir. The destination
    directories are created on the way and each file is cleaned up by copy_file (scrubbed content, added ids, fixed links).
    """
    def dir_callback(dir_node, parent_dir, depth):
        ensure_directory(os.path.join(dest_dir, parent_dir, dir_node['Dir']))

    def topic_callback(topic_node, parent_dir, depth):
        file_name = topic_node["File"] + ".adoc"
        topic_src = os.path.join(src_dir, parent_dir, file_name)
        topic_dest = os.path.join(dest_dir, parent_dir, file_name)

        copy_file(info, book_src_dir, topic_src, dest_dir, topic_dest)

    iter_tree(node, info['distro'], dir_callback, topic_callback)


def copy_file(info, book_src_dir, src_file, dest_dir, dest_file, include_check=True, tag=None, cwd=None):
    """
    Copies a source file to destination, making sure to scrub the content, add id's where the content is referenced elsewhere and fix any
    links that should be cross references. Also copies any includes that are referenced, since they aren't included in _build_cfg.yml.

    :param info: The build info dictionary.
    :param book_src_dir: The source directory of the book that is being processed.
    :param src_file: The file to copy.
    :param dest_dir: The root destination directory.
    :param dest_file: The path the cleaned file is written to. Nothing is done if this file already exists.
    :param include_check: Whether include directives should be followed and rewritten.
    :param tag: The include tag this file was requested with, if any. Passed on to the link fixing.
    :param cwd: The directory links should be resolved against, if it isn't the directory of src_file.
    """
    # It's possible that the file might have been created by another include, if so then just return
    if os.path.isfile(dest_file):
        return

    # Touch the dest file, so we can handle circular includes
    parent_dir = os.path.dirname(dest_file)
    if not os.path.exists(parent_dir):
        os.makedirs(parent_dir)
    open(dest_file, 'w').close()

    # Scrub/fix the content
    content = scrub_file(info, book_src_dir, src_file, tag=tag, cwd=cwd)

    # Check for any includes
    if include_check:
        # Includes are looked up in the content that applies to this distro, but replaced in the full content
        cleaned_content = remove_conditional_content(content, info)
        for include in INCLUDE_RE.finditer(cleaned_content):
            include_text = include.group(0)
            include_path = include.group(1)
            include_unparsed_vars = include.group(2)

            # Determine the include vars
            include_vars = {}
            if include_unparsed_vars:
                for meta in re.split(r"\s*,\s*", include_unparsed_vars):
                    # Split on the first "=" only, so that values containing "=" stay intact. Attributes without a value
                    # carry nothing that is used here, so they are skipped instead of breaking the unpacking.
                    parts = re.split(r"\s*=\s*", meta, maxsplit=1)
                    if len(parts) == 2:
                        include_vars[parts[0]] = parts[1]

            # Determine the include src/dest paths. Include paths are resolved against the parent of the book directory.
            include_file = os.path.join(os.path.dirname(book_src_dir), include_path)
            relative_path = os.path.relpath(include_file, os.path.dirname(src_file))

            # If the path is in another book, copy it into this one
            relative_book_path = os.path.relpath(include_file, book_src_dir)
            if relative_book_path.startswith("../"):
                path, src_book_name = os.path.split(book_src_dir)
                dest_include_dir = os.path.join(dest_dir, src_book_name, "includes")
                relative_path = os.path.join(os.path.relpath(dest_include_dir, parent_dir), os.path.basename(include_file))
            else:
                dest_include_dir = os.path.abspath(os.path.join(os.path.dirname(dest_file), os.path.dirname(relative_path)))
            dest_include_file = os.path.join(dest_include_dir, os.path.basename(include_file))

            # Make sure we have a reference to the current working dir
            current_dir = cwd or os.path.dirname(src_file)
            include_tag = include_vars.get("tag", None)

            # Copy the file and fix the content
            if not os.path.isfile(dest_include_file):
                copy_file(info, book_src_dir, include_file, dest_dir, dest_include_file, tag=include_tag, cwd=current_dir)
            else:
                # The file has already been copied, so just fix the links for this tag
                with open(dest_include_file, 'r') as f:
                    include_content = f.read()

                # Fix any links
                # NOTE(review): this passes cwd, whereas the first copy above passes current_dir - confirm that is intended
                include_content = fix_links(include_content, info, book_src_dir, include_file, tag=include_tag, cwd=cwd)

                with open(dest_include_file, "w") as f:
                    f.write(include_content)

            # Point the include at the copied file, keeping the original attribute list
            content = content.replace(include_text, include.expand("include::" + relative_path + "[\\2]"))

    with open(dest_file, "w") as f:
        f.write(content)

def scrub_file(info, book_src_dir, src_file, tag=None, cwd=None):
    """
    Scrubs a file and returns the cleaned file contents.

    The following is done to the content:

    - Leading blank lines and lines rejected by include_line() are dropped.
    - An id is added in front of the document header if it has none and the file has an entry in info['file_to_id_map'].
    - Ids from TITLE_IDS are added in front of matching section titles that have no id.
    - Ids listed in DUPLICATE_IDS and links listed in INCORRECT_LINKS are replaced.
    - Links are fixed up (see fix_links).

    :param info: The build info dictionary.
    :param book_src_dir: The source directory of the book that is being processed.
    :param src_file: The file to read and clean.
    :param tag: The include tag the file was requested with, if any.
    :param cwd: The directory links should be resolved against, if it isn't the directory of src_file.
    :return: The cleaned content as a string.
    """
    base_src_file = src_file.replace(info['src_dir'] + "/", "")

    # Get a list of predefined custom title ids for the file
    title_ids = TITLE_IDS.get(base_src_file, {})

    # Read in the source content
    with open(src_file, 'r') as f:
        src_file_content = f.readlines()

    # Scrub the content
    content = ""
    header_found = content_found = False
    # The id line that applies to the upcoming block, or None if there isn't one
    current_id = None
    for line in src_file_content:
        # Ignore any leading blank lines, before any meaningful content is found
        if line.strip() == "" and not content_found:
            continue

        # Check if the line should be included in the output
        if include_line(line):
            content_found = True

            # Setup the document header content/id
            if not header_found and line.strip() != "" and line.startswith("="):
                header_found = True

                if info['all_in_one'] and base_src_file in ALL_IN_ONE_SCRAP_TITLE and line.startswith("= "):
                    continue
                # Add a section id if one doesn't exist, so we have something to link to
                elif current_id is None and src_file in info['file_to_id_map']:
                    file_id = info['file_to_id_map'][src_file]
                    content += "[[" + file_id + "]]\n"
            # Add a custom title id, if one is needed
            elif line.startswith("=") and current_id is None:
                for title in title_ids:
                    title_re = r"^=+ " + title.replace(".", "\\.").replace("?", "\\?") + "( (anchor|\[).*?)?(\n)?$"
                    if re.match(title_re, line):
                        content += "[[" + title_ids[title] + "]]\n"

            # Set the current id based on the line content. The id stays in effect across blank lines and is dropped at the
            # next non-blank line. (This used to compare the bound method `line.strip` with "", which is always unequal, so
            # the id was dropped at blank lines too.)
            if current_id is None and ID_RE.match(line.strip()):
                current_id = line.strip()
            elif current_id is not None and line.strip() != "":
                current_id = None

            # Add the line to the processed content
            content += line

    # Fix up any duplicate ids
    if base_src_file in DUPLICATE_IDS:
        for duplicate_id, new_id in DUPLICATE_IDS[base_src_file].items():
            content = content.replace("[[" + duplicate_id + "]]", "[[" + new_id + "]]")

    # Replace incorrect links with correct ones
    if base_src_file in INCORRECT_LINKS:
        for incorrect_link, fixed_link in INCORRECT_LINKS[base_src_file].items():
            content = content.replace(incorrect_link, fixed_link)

    # Fix up the links
    content = fix_links(content, info, book_src_dir, src_file, tag=tag, cwd=cwd)

    return content


def include_line(line):
    """
    Decides whether a line is kept in the filtered output. Lines listed in IGNORE_LINES and definitions of the macros listed in
    IGNORE_MACROS (lines starting with ":MACRO:") are dropped.
    """
    if line in IGNORE_LINES:
        return False

    return not any(line.startswith(":" + macro + ":") for macro in IGNORE_MACROS)


def fix_links(content, info, book_src_dir, src_file, tag=None, cwd=None):
    """
    Fix any links that were done incorrectly and reference the output instead of the source content.

    :param content: The AsciiDoc content to fix.
    :param info: The build info dictionary.
    :param book_src_dir: The source directory of the book that is being processed.
    :param src_file: The source file the content was read from.
    :param tag: The include tag the content was requested with, if any.
    :param cwd: The directory relative links should be resolved against, if it isn't the directory of src_file.
    :return: The content with the links fixed.
    """
    if info['all_in_one']:
        # Everything ends up in a single book, so the whole source directory acts as the book directory. This branch used to
        # call fix_links() itself with the arguments in _fix_links() order, which failed as soon as it was reached.
        content = _fix_links(content, info['src_dir'], src_file, info, tag=tag, cwd=cwd)
    else:
        # Determine if the tag should be passed when fixing the links. If it's in the same book, then process the entire file. If it's
        # outside the book then don't process it.
        if book_src_dir in src_file:
            content = _fix_links(content, book_src_dir, src_file, info, cwd=cwd)
        else:
            content = _fix_links(content, book_src_dir, src_file, info, tag=tag, cwd=cwd)

    return content


def _fix_links(content, book_dir, src_file, info, tag=None, cwd=None):
    """
    Fix any links that were done incorrectly and reference the output instead of the source content.

    Links to files of the same book become plain xrefs to the target id, links to files of other books become links to that book on
    the portal. Links to files that aren't part of the build are left alone; if they look like links into the docs suite an error is
    logged and the module level has_errors flag is set.

    :param content: The AsciiDoc content to fix.
    :param book_dir: The source directory of the current book.
    :param src_file: The source file the content was read from.
    :param info: The build info dictionary (uses 'file_to_id_map', 'data' and 'distro').
    :param tag: If set, only links inside this tagged region are looked at.
    :param cwd: The directory relative links are resolved against. Defaults to the directory of src_file.
    :return: The content with the links fixed.
    """
    # Without this the assignment below would only bind a local variable and the module level flag would never change
    global has_errors

    # TODO Deal with xref so that they keep the proper path. Atm it'll just strip the path and leave only the id
    file_to_id_map = info['file_to_id_map']
    current_dir = cwd or os.path.dirname(src_file)
    # Only links that apply to this distro (and tag) are considered, the replacements are done on the full content
    cleaned_content = remove_conditional_content(content, info, tag=tag)
    links = LINKS_RE.finditer(cleaned_content)

    for link in links:
        link_text = link.group(0)
        link_file = link.group(1)
        link_anchor = link.group(2)
        link_title = link.group(3)

        if link_file is not None:
            fixed_link_file = link_file.replace(".html", ".adoc")
            fixed_link_file_abs = os.path.abspath(os.path.join(current_dir, fixed_link_file))
            if fixed_link_file_abs in file_to_id_map:
                if fixed_link_file_abs.startswith(book_dir + os.sep) or fixed_link_file_abs == src_file:
                    # We are dealing with a cross reference within the same book here
                    if link_anchor is None:
                        # Cross reference to the top of a topic, without an id being specified
                        link_anchor = "#" + file_to_id_map[fixed_link_file_abs]

                    fixed_link = "xref:" + link_anchor.replace("#", "") + link_title
                else:
                    # We are dealing with a cross reference to another book here
                    external_link = EXTERNAL_LINK_RE.search(link_file)
                    book_dir_name = external_link.group(1)

                    # Find the book name
                    book_name = book_dir_name
                    for book in info['data']:
                        if check_node_distro_matches(book, info['distro']) and book['Dir'] == book_dir_name:
                            book_name = book['Name']
                            break

                    fixed_link_file = BASE_PORTAL_URL + build_portal_url(info, book_name)

                    if link_anchor is None:
                        fixed_link = "link:" + fixed_link_file + "#" + file_to_id_map[fixed_link_file_abs] + link_title
                    else:
                        fixed_link = "link:" + fixed_link_file + link_anchor + link_title
            else:
                # Cross reference or link that isn't in the docs suite
                fixed_link = link_text
                if EXTERNAL_LINK_RE.search(link_file) is not None:
                    rel_src_file = src_file.replace(os.path.dirname(book_dir) + "/", "")
                    has_errors = True
                    log.error("ERROR (%s): \"%s\" appears to try to reference a file not included in the \"%s\" distro", rel_src_file, link_text.replace("\n", ""), info['distro'])
        elif link_anchor is not None:
            fixed_link = "xref:" + link_anchor.replace("#", "") + link_title
        else:
            # Neither a file nor an anchor (e.g. "link:[text]"), so there is nothing to rewrite
            continue

        content = content.replace(link_text, fixed_link)

    return content


def remove_conditional_content(content, info, tag=None):
    """
    Removes any conditional content that doesn't match for the specified distro

    Three things are removed, in this order:

    - ifdef::/ifndef:: blocks that don't apply to info['distro'] (the directive lines of blocks that do apply are left in place)
    - block comments (content between two "////" lines)
    - if `tag` is given and a tagged region with that name exists, everything outside of that region

    Returns the remaining content as a string.
    """
    # Remove any ifdef content
    ifdef = IFDEF_RE.search(content)
    while ifdef is not None:
        is_not_def = ifdef.group(1) == "n"
        # The names are compared as written, no whitespace is stripped
        ifdef_distros = ifdef.group(2).split(",")
        pos = ifdef.start()
        end = ifdef.end()

        # Determine if we should strip the conditional content, based on the distro
        strip_content = False
        if is_not_def and info['distro'] in ifdef_distros:
            strip_content = True
        elif not is_not_def and info['distro'] not in ifdef_distros:
            strip_content = True

        # Remove the conditional content
        if strip_content:
            # Find the correct endif for the current ifdef
            search_pos = end
            # If no endif is found, everything up to the end of the content is removed
            endpos = len(content)
            while True:
                next_ifdef = IFDEF_RE.search(content, search_pos)
                endif = ENDIF_RE.search(content, search_pos)

                if not endif:
                    break
                elif not next_ifdef or next_ifdef.start() > endif.start():
                    endpos = endif.end()
                    break
                else:
                    # Another ifdef opens before this endif, so the endif is treated as closing that one; keep looking after it
                    search_pos = endif.end()

            # Replace the content and move the end pos to be the same as the start since the content was removed
            ifdef_text = content[pos:endpos]
            # Note: this removes every occurrence of the exact same text, not just the one at pos
            content = content.replace(ifdef_text, "")
            end = pos

        # Move onto the next ifdef
        ifdef = IFDEF_RE.search(content, end)

    # Remove commented out content
    for comment in COMMENT_CONTENT_RE.finditer(content):
        content = content.replace(comment.group(0), "")

    # Remove content outside of tags
    if tag is not None:
        for tag_match in TAG_CONTENT_RE.finditer(content):
            tag_text = tag_match.group(0)
            tag_label = tag_match.group(1)
            if tag_label == tag:
                # Tag matches, so only use the content in the tag
                content = tag_text

    return content


def collect_existing_ids(node, distro, path):
    """
    Gathers the ids that are already defined in the asciidoc files of all topics below the node and returns them as a list.
    """
    found_ids = []

    def topic_callback(topic_node, parent_dir, depth):
        topic_file = os.path.join(parent_dir, topic_node["File"] + ".adoc")
        found_ids.extend(extract_file_ids(topic_file))

    iter_tree(node, distro, topic_callback=topic_callback, parent_dir=path)

    return found_ids


def build_file_to_id_map(node, distro, existing_ids, path=""):
    """
    Creates a dictionary that maps the path of every topic file below the node to a generated root id for that file. The mapping is
    used to fix links that point at files instead of ids.
    """
    ids_by_file = {}

    def topic_callback(topic_node, parent_dir, depth):
        topic_file = os.path.join(parent_dir, topic_node["File"] + ".adoc")
        # The ids generated so far are passed along, so that every file gets a unique id
        topic_id = build_file_id(topic_node["Name"], ids_by_file, existing_ids)
        ids_by_file[topic_file] = topic_id

    iter_tree(node, distro, topic_callback=topic_callback, parent_dir=path)
    return ids_by_file


def extract_file_ids(file_path):
    """
    Reads the specified file and returns a list of all the ids (as matched by ID_RE) that are defined in it.
    """
    with open(file_path, "r") as f:
        text = f.read()

    return [match.group(1) for match in ID_RE.finditer(text)]


def build_file_id(file_title, file_to_id_map, existing_ids):
    """
    Derives a unique id for a file from its title.

    The title is lower cased, underscores and spaces become hyphens and brackets, parentheses and "#" are removed. If the result is
    already used, either in existing_ids or as a value of file_to_id_map, "-1", "-2", ... is appended until it is unique.
    """
    slug = file_title.lower().replace("_", "-").replace(" ", "-")
    base_id = re.sub(r"[\[\]\(\)#]", "", slug)

    def is_taken(candidate):
        return candidate in existing_ids or candidate in file_to_id_map.values()

    file_id = base_id
    suffix = 0
    while is_taken(file_id):
        suffix += 1
        file_id = base_id + "-" + str(suffix)

    return file_id


def build_portal_url(info, book_name):
    """
    Creates the portal url path "<product>/<version>/html-single/<book>/" for a book, with every part escaped by
    generate_url_from_name().
    """
    product = info['product']
    version = info['product-version']

    segments = [
        generate_url_from_name(product),
        generate_url_from_name(version),
        "html-single",
        generate_url_from_name(book_name),
    ]
    return "/".join(segments) + "/"


def replace_nbsp(val):
    """Swaps every non breaking space in val for a regular space. None is passed through unchanged."""
    if val is None:
        return None

    # Unicode strings hold the character itself, byte strings hold its two byte UTF-8 encoding
    if isinstance(val, unicode):
        return val.replace(u'\xa0', ' ')

    return val.replace('\xc2\xa0', ' ')


def generate_url_from_name(name, delimiter='_'):
    """
    Generates a url fragment from a product, version or titles name.

    :param name: The name to convert.
    :param delimiter: The string that replaces whitespace in the name.
    :return: The lower cased url fragment.
    """
    # Remove characters that aren't allowed in urls
    url = re.sub(r"^\.+|[^0-9a-zA-Z _\-.]+", "", replace_nbsp(name))
    # Replace spaces with the delimiter. A function is used as the replacement so the delimiter is always inserted literally.
    url = re.sub(r"\s+", lambda match: delimiter, url)
    # Replace repeated delimiters with a single one. The delimiter is escaped, as it may contain characters that are special in
    # a regular expression (e.g. "." or "+").
    url = re.sub("(?:" + re.escape(delimiter) + ")+", lambda match: delimiter, url)
    return url.lower()


def call_git_command(*args, **kwargs):
    """
    Calls a git command and retries the command if it is unable to connect to the remote repo

    All arguments are passed on to subprocess.check_output(), except for the following keyword arguments:

    - retries: How many times the command is attempted in total (default 3).
    - retry_delay: How many seconds to wait before a retry (default 2).

    The output of the successful attempt is echoed to stdout and returned. Previously the output of a successful retry was lost and
    None was returned instead.

    :raises subprocess.CalledProcessError: If the command fails for another reason, or the retries are used up.
    """
    retries = kwargs.pop("retries", 3)
    retry_delay = kwargs.pop("retry_delay", 2)

    while True:
        try:
            output = subprocess.check_output(*args, **kwargs)
        except subprocess.CalledProcessError as e:
            retries -= 1
            if retries > 0 and e.output and "fatal: Could not read from remote repository" in e.output:
                # Connection failed, so wait a couple of secs and try again
                time.sleep(retry_delay)
                continue
            raise

        if output is not None:
            sys.stdout.write(output)
        return output


def fetch_sources(url, branch, dir=None, clone_dirname=None):
    """
    Makes sure an up to date clone of a git repository exists in `dir`/`clone_dirname`.

    `dir` defaults to the current working directory and `clone_dirname` to the last part of the url with ".git" removed. If the
    directory already holds a clone, the branch is checked out and pulled. Otherwise the branch is cloned from the url.
    """
    base_dir = os.getcwd() if dir is None else dir
    if clone_dirname is None:
        clone_dirname = url.split('/')[-1].replace(".git", "")

    repo_dir = os.path.abspath(os.path.join(base_dir, clone_dirname))
    already_cloned = os.path.exists(os.path.join(repo_dir, ".git"))

    if already_cloned:
        # Switch to the requested branch first, so that the pull updates the right one
        subprocess.check_output(["git", "checkout", branch], cwd=repo_dir, stderr=subprocess.STDOUT)
        git_cmd = ["git", "pull", "-f"]
        run_dir = repo_dir
    else:
        git_cmd = ["git", "clone", "-b", branch, url, clone_dirname]
        run_dir = os.path.abspath(base_dir)

    # Run the pull/clone, retrying on connection problems
    call_git_command(git_cmd, cwd=run_dir, stderr=subprocess.STDOUT)


def sync_directories(src_dir, dest_dir, ignore=None):
    """
    Syncs two directories so that they both contain the same content, with the exception of ignored files.

    src_dir  -- the directory to copy content from
    dest_dir -- the directory that is updated to match `src_dir`
    ignore   -- an optional list of names to leave alone, in addition to the names in `CMP_IGNORE_FILES`
    """
    # Work on a copy, so that the list passed in by the caller isn't modified
    ignored_names = [] if ignore is None else list(ignore)
    ignored_names.extend(CMP_IGNORE_FILES)

    dcmp = filecmp.dircmp(src_dir, dest_dir, ignored_names)
    _sync_directories_dircmp(dcmp)


def _sync_directories_dircmp(dcmp):
    """
    Applies the differences found by a `filecmp.dircmp` comparison, making the right (destination) directory match the
    left (source) directory, and then does the same for every sub directory the two sides have in common.
    """
    # Remove entries that only exist in the dest directory
    for filename in dcmp.right_only:
        right = os.path.join(dcmp.right, filename)
        # Only real directories are removed as a tree. shutil.rmtree refuses to work on symbolic links, so links
        # (including dangling ones) and any other non-directory entries are removed as a single entry instead.
        if os.path.isdir(right) and not os.path.islink(right):
            shutil.rmtree(right)
        else:
            os.remove(right)

    # Copy entries that only exist in the source directory, and re-copy every file that exists on both sides
    for filename in dcmp.left_only + dcmp.common_files:
        left = os.path.join(dcmp.left, filename)
        right = os.path.join(dcmp.right, filename)
        if os.path.isfile(left):
            shutil.copy2(left, right)
        else:
            shutil.copytree(left, right)

    # Sync sub directories
    for subdcmp in dcmp.subdirs.values():
        _sync_directories_dircmp(subdcmp)


def commit_and_push_changes(git_dir, git_branch, git_upstream_branch):
    """
    Stages everything in a local git repository, commits it and pushes the commit.

    git_dir             -- the directory of the local repository the git commands are run in
    git_branch          -- the branch name used in the commit message as the merge target
    git_upstream_branch -- the upstream branch name used in the commit message as the merge source

    A failure whose output contains "nothing to commit" is ignored, any other `subprocess.CalledProcessError` raised by
    the commit or the push is passed on to the caller.
    """
    # Stage every change, including deletions
    subprocess.check_call(["git", "add", "--all"], cwd=git_dir)

    message = "Merge branch 'upstream/" + git_upstream_branch + "' into " + git_branch
    author = "CCS OSE Build Script <no-reply@redhat.com>"
    try:
        call_git_command(["git", "commit", "-m", message, "--author", author], cwd=git_dir, stderr=subprocess.STDOUT)
        call_git_command(["git", "push"], cwd=git_dir, stderr=subprocess.STDOUT)
    except subprocess.CalledProcessError as e:
        nothing_to_commit = e.output is not None and "nothing to commit" in e.output
        if not nothing_to_commit:
            raise


def parse_repo_config(config_file, distro, version):
    """
    Reads the repository urls for a distro/version combination from an ini style configuration file.

    The urls are looked up in the section named "<distro>-<version>". Returns a dict holding the key/value pairs of
    that section, or an empty dict if the file has no such section. If `config_file` isn't an existing file an error is
    logged and the script exits with -1.
    """
    # Bail out early when there is no configuration to read
    if not os.path.isfile(config_file):
        log.error("Failed loading the repo configuration from %s", config_file)
        sys.exit(-1)

    repo_config = ConfigParser.SafeConfigParser()
    repo_config.read(config_file)

    section_name = distro + "-" + version
    if not repo_config.has_section(section_name):
        return dict()
    return dict(repo_config.items(section_name))


def main():
    """
    Runs the build from the command line arguments.

    Optionally fetches the upstream sources, parses the build config, recreates the `drupal-build/<distro>` directory,
    builds the master files and reformats the content for drupal. The script exits with 1 if `has_errors` is set
    afterwards. When pushing is requested, each book listed in `repos.ini` for the distro/version is synced into a
    checkout of its GitLab repository, committed and pushed.
    """
    args = setup_parser().parse_args()
    logging.basicConfig(format='%(message)s', level=logging.INFO, stream=sys.stdout)

    # Pull in the latest upstream content, unless told not to
    if not args.no_upstream_fetch:
        log.info("Fetching the upstream sources")
        fetch_sources(args.upstream_url, args.upstream_branch, clone_dirname=CLONE_DIR)

    # Locate and parse the build config
    config = find_build_config_file()
    src_dir = os.path.dirname(config)
    data = parse_build_config(config)

    # Only the books that apply to the requested distro get built
    book_nodes = [book for book in data if check_node_distro_matches(book, args.distro)]

    # Prepare the output directory, wiping any previous build when cleaning is enabled
    dest_dir = os.path.join(os.getcwd(), "drupal-build", args.distro)
    if not args.no_clean:
        log.info("Cleaning the drupal-build directory")
        if os.path.exists(dest_dir):
            shutil.rmtree(dest_dir)
    if not os.path.exists(dest_dir):
        os.makedirs(dest_dir)

    info = {
        'title': args.title,
        'product-author': args.author,
        'product-version': args.version,
        'product': args.product,
        'distro': args.distro,
        'src_dir': src_dir,
        'dest_dir': dest_dir,
        'data': data,
        'book_nodes': book_nodes,
        'all_in_one': args.all_in_one,
        'preface-title': "",
        "upstream_branch": args.upstream_branch
    }

    # Build the master files
    log.info("Building the drupal files")
    build_master_files(info)

    # Copy the original data and reformat for drupal
    reformat_for_drupal(info)

    if has_errors:
        sys.exit(1)

    # Everything below is only needed when the result should be pushed
    if not args.push:
        return

    # Parse the repo urls
    repo_config_file = os.path.join(os.path.dirname(__file__), 'repos.ini')
    repo_urls = parse_repo_config(repo_config_file, args.distro, args.version)

    # Make sure the base git directory exists
    base_git_dir = os.path.join(os.getcwd(), "gitlab-repos")
    ensure_directory(base_git_dir)

    # Checkout each gitlab repo, copy the changes over and push them back up
    for book_dir, gitlab_repo_url in repo_urls.items():
        build_book_dir = os.path.join(dest_dir, book_dir)
        git_dirname = gitlab_repo_url.split('/')[-1].replace(".git", "")
        git_dir = os.path.join(base_git_dir, git_dirname)

        try:
            log.info("Fetching " + book_dir + " sources from GitLab")
            fetch_sources(gitlab_repo_url, args.branch, base_git_dir, git_dirname)

            log.info("Syncing " + book_dir)
            sync_directories(build_book_dir, git_dir, ["docinfo.xml"])

            log.info("Pushing " + book_dir + " changes back to GitLab")
            commit_and_push_changes(git_dir, args.branch, args.upstream_branch)
        except subprocess.CalledProcessError as e:
            # Show what the failed command printed before passing the error on
            if e.output:
                sys.stdout.write(e.output)
            raise

# Run the build only when this file is executed as a script, not when it is imported
if __name__ == "__main__":
    main()