Extract textblocks from Confluence Cloud using a Python API¶

The purpose of this script is to export textblocks from Confluence in order to provide them as building blocks for online application forms.

Textblocks are included in application forms via a macro that takes the exported textblocks as input.

Get textblock data¶

Keep your Confluence configuration secret¶

This script assumes that its configuration is specified in a config.json file with the following content:

{
    "domain": "your_domain",
    "username": "your_username",
    "api_token": "your_api_token",
    "output_dir": "your_output_directory",
    "li_symbol": "your_symbol_for_list_elements"
}

This way you avoid specifying your confidential Confluence configuration in the script itself.

Please make sure that your config.json is kept secret, e.g. by adding it to your .gitignore:

# Inside .gitignore

config.json

Apart from the Confluence login credentials, config.json may also contain other configuration parameters, such as

  • the path to the output directory (e.g. "./")
  • the symbol used to prefix list elements in plain text output (e.g. "-")
In [ ]:
# Read config parameters
import json

# JSON files are UTF-8 encoded; state this explicitly so that non-ASCII values
# (e.g. a bullet character as 'li_symbol') are not misread on platforms whose
# default encoding differs.
with open('config.json', 'r', encoding='utf-8') as f:
    config = json.load(f)
In [ ]:
# Connect to Confluence cloud

from atlassian import Confluence

# Gather the connection settings from the config first;
# the API token is passed in the password slot.
connection_settings = {
    'url': config['domain'],
    'username': config['username'],
    'password': config['api_token'],
    'cloud': True,
}

confluence = Confluence(**connection_settings)
In [ ]:
# Read textblock content from Confluence

import bs4
from bs4 import BeautifulSoup
import textwrap as tr

# Optional: improve readability of formatted html by customizing indentation (default indent = 1)
formatter = bs4.formatter.HTMLFormatter(indent = 4)

# Html elements that trigger a newline when converted to plain text
elem_newline = ['p', 'li', 'h1', 'h2', 'h3', 'h4', 'h5']

# For plain text: define prefix used for list elements.
# 'li_symbol' is an optional config parameter; missing or empty means "no prefix".
li_symbol = config.get('li_symbol') or ""
li_prefix = li_symbol + " " if li_symbol else ""

# Fetch all pages labelled 'textblock'.
# The label search returns at most 'limit' pages per call, so keep requesting
# batches until a short batch signals that nothing is left.
page_size = 50
tb_pages = []
while True:
    batch = confluence.get_all_pages_by_label(
        'textblock', start=len(tb_pages), limit=page_size) or []
    tb_pages.extend(batch)
    if len(batch) < page_size:
        break

print(f'Extracting {len(tb_pages)} textblocks from Confluence')

# Extract textblock titles and contents
# Each entry is [tb_id, title, content_html, content_text]
tb_data = []

for p in tb_pages:
    page = confluence.get_page_by_id(p['id'], expand = 'body.storage', status = None, version = None)

    # Process page title: "<tb_id> <title>", split at the first space.
    # partition() copes with titles that contain no space (title is then empty).
    tb_id, _, title = page['title'].partition(' ')

    # Get textblock content
    page_body = page['body']['storage']['value']
    soup = BeautifulSoup(page_body, 'html.parser')
    # The textblock content should be in the last ac:structured-macro of a page (index -1)
    macros = soup('ac:structured-macro')
    bodies = macros[-1]('ac:rich-text-body') if macros else []
    if not bodies:
        # A labelled page without the expected macro must not abort the whole export
        print(f"Skipping page {page['title']!r}: no textblock macro with a rich-text body found")
        continue
    content = bodies[0]

    # Prepare textblock content for output
    # Replace 'ac:rich-text-body' tag name with 'div'
    content.name = 'div'
    # Add attributes to div tag
    content['tb_id'] = tb_id
    content['title'] = title

    # Convert content to formatted html
    content_html = str(content)
    # Optional: prettify html content
    #content_html = content.prettify(formatter = formatter)

    # Convert content to plain text
    # The last tag is followed by its own string among the descendants, hence
    # "- 1": neither of these two final elements gets a trailing newline.
    descendants = list(content.descendants)
    elem_max = len(descendants) - 1
    text_parts = []
    old_string = ''
    # Iterate over all (nested) content elements in document order
    for elem_counter, elem in enumerate(descendants, start=1):
        # Suppress duplicate strings: a tag with a single string child reports
        # that string via .string, and the child follows right after it.
        if elem.string and elem.string == old_string:
            continue

        if elem.name in ['ul', 'ol']:
            # Process lists: one prefixed line per direct li child.
            # NOTE(review): nested lists are visited again as descendants and
            # therefore repeated in the output - confirm whether they occur.
            for li in elem.find_all('li', recursive=False):
                text_parts.append(li_prefix)
                # Add (recursive) content of li element
                text_parts.extend(li.strings)
                if elem_counter < elem_max:
                    text_parts.append("\n")
        elif (elem.find_parent('ul') is None and
                elem.find_parent('ol') is None):
            # Process everything else that is not part of a list.
            # 'elif' matters: a single-item list has a non-empty .string and
            # would otherwise be emitted a second time here.
            # NOTE(review): a block element with mixed inline content
            # (e.g. <p>a <strong>b</strong></p>) has .string None, so no
            # newline is emitted after it - confirm whether this is intended.
            if elem.string:
                text_parts.append(elem.string)
                if elem.name in elem_newline and elem_counter < elem_max:
                    text_parts.append("\n")

        old_string = elem.string

    content_text = "".join(text_parts)

    tb_data.append([tb_id, title, content_html, content_text])
Extracting 7 textblocks from Confluence
In [ ]:
# Create ts file from textblock data

# Header line marking the file as generated
comment_line = "// GENERATED CODE: DO NOT CHANGE MANUALLY!\n\n"

# Opening of the ts dictionary definition
type_def = "export const texts: { [key: string]: string } =\n"

# Map tb_id -> content_html (entry[0] is the tb_id, entry[2] the html content)
dict_tb_html = dict((entry[0], entry[2]) for entry in tb_data)
# Map tb_id + "_plain" -> content_text (entry[3] is the plain text content)
dict_tb_plain = dict((entry[0] + '_plain', entry[3]) for entry in tb_data)
# Combine both mappings; plain entries are applied last
dict_tb = dict(dict_tb_html)
dict_tb.update(dict_tb_plain)

# Serialize dictionary content: sorted keys, key and value on separate lines
dict_content = json.dumps(
    dict_tb,
    indent=0,
    sort_keys=True,
    separators=(',\n', ':\n'),
)

# Build complete content of type definition file
type_def = "".join([comment_line, type_def, dict_content])
In [ ]:
import re


def _to_template_literal(json_line):
    """Convert one serialized JSON string line into a multiline ts template literal.

    json_line is a line such as '"first\\nsecond",' (a JSON string with an
    optional trailing comma). Lines of any other shape are returned unchanged.
    """
    match = re.fullmatch(r'"(.*)"(,?)', json_line)
    if match is None:
        return json_line
    body, comma = match.groups()
    # Backticks and '${' are special inside a template literal and must be escaped
    body = body.replace('`', '\\`').replace('${', '\\${')
    # Turn JSON '\n' escapes into real line breaks. Consuming escape sequences
    # pairwise from left to right keeps an escaped backslash followed by the
    # letter 'n' intact, and leaves all other escapes (e.g. '\"') as they are.
    body = re.sub(
        r'\\(.)',
        lambda m: '\n' if m.group(1) == 'n' else m.group(0),
        body)
    # Only the enclosing quotes become backticks; quotes inside the text stay quotes
    return '`' + body + '`' + comma


# Convert plain text strings into multiline strings
transformed_def = []
type_def_lines = type_def.split("\n")
# Key and value are serialized on separate lines, so the line that follows a
# '..._plain' key line holds the plain text string.
follows_plain_key = False
for line in type_def_lines:
    is_plain_value = follows_plain_key
    # Match the key line exactly; a value line can never end with '":'
    follows_plain_key = line.endswith('_plain":')
    if is_plain_value:
        line = _to_template_literal(line)
    transformed_def.append(line)

# Concatenate all type definition lines with newline as separator
type_def = "\n".join(transformed_def)

Write textblocks to typescript dictionary definition¶

In [ ]:
# Write ts file

import os

# Join directory and file name properly so that 'output_dir' works with or
# without a trailing path separator
ts_file_name = os.path.join(config['output_dir'], 'texts.ts')

print(f'Writing {ts_file_name}')

# The context manager closes the file even if writing fails
with open(ts_file_name, 'w', encoding='utf-8') as ts_file:
    ts_file.write(type_def)
Writing ./texts.ts

Write textblocks to Excel file¶

In [ ]:
# Write textblock data to Excel file

import os

import pandas as pd

# Create dataframe from textblock data
df_tb = pd.DataFrame(tb_data, columns=['tb_id', 'title', 'content_html', 'content_text'])

# Join directory and file name properly so that 'output_dir' works with or
# without a trailing path separator
result_file = os.path.join(config['output_dir'], 'tb_data.xlsx')

print(f'Writing {result_file}')

# index=False: do not write the dataframe's row index as an extra column
df_tb.to_excel(result_file, sheet_name='textblocks', index=False)
Writing ./tb_data.xlsx