# Read config parameters
import json
import os

# JSON text is UTF-8 encoded; naming the encoding explicitly keeps non-ASCII
# settings (e.g. a bullet character in 'li_symbol') intact on platforms whose
# default locale encoding is something else.
with open('config.json', 'r', encoding='utf-8') as f:
    config = json.load(f)


# Connect to Confluence cloud

from atlassian import Confluence

# Create the client from the configured domain and account name; the API
# token from the config is handed over as the client's password.
confluence = Confluence(
    url=config['domain'],
    username=config['username'],
    password=config['api_token'],
    cloud=True,
)


# Read textblock content from Confluence

import bs4
from bs4 import BeautifulSoup
import textwrap as tr

# Optional: improve readability of formatted html by customizing indentation (default indent = 1)
formatter = bs4.formatter.HTMLFormatter(indent=4)

# Html elements that trigger a newline when converted to plain text
elem_newline = ['p', 'li', 'h1', 'h2', 'h3', 'h4', 'h5']

# For plain text: prefix put in front of every list element.
# It is the configured symbol plus a space, or empty when no symbol is configured.
li_prefix = (config['li_symbol'] + " ") if config['li_symbol'] != "" else ""

# State flags for list handling (not read anywhere in the visible code)
inside_li = False
prefix_used = False

# Extract textblock titles and contents
tb_data = []

# Maximum number of labelled pages requested from Confluence
tb_page_limit = 50

tb_pages = confluence.get_all_pages_by_label('textblock', start=0, limit=tb_page_limit)

print(f'Extracting {len(tb_pages)} textblocks from Confluence')

# A completely filled result means further labelled pages may exist beyond
# the requested limit; report that instead of dropping them silently.
if len(tb_pages) >= tb_page_limit:
    print(f'Warning: page limit of {tb_page_limit} reached, some textblocks may be missing')

for p in tb_pages:
    # Per-page state of the plain text conversion
    text_parts = []
    old_string = ''

    page = confluence.get_page_by_id(p['id'], expand='body.storage', status=None, version=None)

    # Process page title: split it at the first occurrence of a space into
    # textblock id and title. partition() also copes with a title that has
    # no space (the whole title becomes the id, the title stays empty),
    # where unpacking the result of split() raised a ValueError.
    tb_id, _sep, title = page['title'].partition(' ')

    # Get textblock content
    page_body = page['body']['storage']['value']
    soup = BeautifulSoup(page_body, 'html.parser')
    # The textblock content should be in the last ac:structured-macro of a page (index -1)
    content = soup('ac:structured-macro')[-1]('ac:rich-text-body')[0]

    # Prepare textblock content for output
    # Replace 'ac:rich-text-body' tag name with 'div'
    content.name = 'div'
    # Add attributes to div tag
    content['tb_id'] = tb_id
    content['title'] = title

    # Convert content to formatted html
    content_html = str(content)
    # Optional: prettify html content
    #content_html = content.prettify(formatter = formatter)

    # Convert content to plain text
    # The last element has to be treated differently: no trailing newline
    descendants = list(content.descendants)
    elem_max = len(descendants) - 1
    # Iterate recursively over content elements
    for elem_counter, elem in enumerate(descendants, start=1):
        # Suppress duplicate strings: a tag that wraps a single string has
        # the same .string as the string node visited right after it
        if elem.string and elem.string == old_string:
            continue    # proceed to next iteration

        if elem.name in ['ul', 'ol']:
            # Process lists: one prefixed line per list element
            # NOTE(review): a list nested inside a list element is written
            # here as part of the outer element and once more when the
            # nested list itself is visited - confirm the intended output.
            for li in elem.children:
                text_parts.append(li_prefix)
                # Add (recursive) content of li element
                text_parts.extend(li.strings)
                if elem_counter < elem_max:
                    text_parts.append("\n")
        elif elem.find_parent(['ul', 'ol']) is None:
            # Process everything else that is not part of a list.
            # This has to be an alternative to the list branch: a list
            # holding a single string has a .string of its own and would
            # otherwise be written a second time without prefix.
            if elem.string:
                text_parts.append(elem.string)
                if elem.name in elem_newline and elem_counter < elem_max:
                    text_parts.append("\n")

        old_string = elem.string

    content_text = ''.join(text_parts)

    tb_data.append([tb_id, title, content_html, content_text])

# Output: Extracting 7 textblocks from Confluence


# Create ts file from textblock data

# Header comment of the generated ts file
comment_line = "// GENERATED CODE: DO NOT CHANGE MANUALLY!\n\n"

# Opening of the ts dictionary definition
type_def = "export const texts: { [key: string]: string } =\n"

# Dictionary with key = tb_id and value = content_html (dict comprehension)
dict_tb_html = {entry[0]: entry[2] for entry in tb_data}
# Dictionary with key = tb_id extended with "_plain" and value = content_text (dict comprehension)
dict_tb_plain = {entry[0] + '_plain': entry[3] for entry in tb_data}
# Merge the two dictionaries; plain text entries win on identical keys
dict_tb = dict(dict_tb_html)
dict_tb.update(dict_tb_plain)

# Serialize dictionary content: keys sorted, key and value on separate lines
dict_content = json.dumps(dict_tb, sort_keys=True, indent=0, separators=(',\n', ':\n'))

# Build complete content of type definition file
type_def = ''.join([comment_line, type_def, dict_content])


import re

# Convert plain text strings into multiline strings (backtick-quoted template literals)

def _to_template_literal(json_line):
    """Turn one serialized JSON string line into a multiline template literal.

    json_line is a double-quoted string, optionally followed by the comma
    that separates dictionary entries. Only the enclosing quotes are replaced
    by backticks; quotes inside the text stay untouched. A line of any other
    shape is returned unchanged.
    """
    match = re.fullmatch(r'"(.*)"(,?)', json_line)
    if match is None:
        return json_line
    text, comma = match.groups()
    # Turn every escaped newline into a real line break. Consuming the escape
    # sequences pairwise keeps an escaped backslash that is followed by the
    # letter n from being mistaken for a newline.
    text = re.sub(r'\\(.)',
                  lambda esc: '\n' if esc.group(1) == 'n' else esc.group(0),
                  text)
    # A backtick would end the template literal and "${" would start an
    # interpolation, so both are escaped.
    text = text.replace('`', '\\`').replace('${', '\\${')
    return '`' + text + '`' + comma


transformed_def = []
# True while the line being processed is the value of a plain text key
plain_value_expected = False
for line in type_def.split("\n"):
    # A key line ending in _plain": announces that the next line contains
    # the plain text string
    is_plain_key = line.endswith('_plain":')
    if plain_value_expected:
        line = _to_template_literal(line)
    plain_value_expected = is_plain_key

    transformed_def.append(line)

# Concatenate all type definition lines with newline as separator
type_def = "\n".join(transformed_def)


# Write ts file

# os.path.join adds the path separator only when 'output_dir' does not
# already end with one, so both './' and '.' lead to the same file
ts_file_name = os.path.join(config['output_dir'], 'texts.ts')

print(f'Writing {ts_file_name}')

# The context manager closes the file even when writing fails
with open(ts_file_name, 'w', encoding='utf-8') as ts_file:
    ts_file.write(type_def)

# Output: Writing ./texts.ts


# Write textblock data to Excel file

import pandas as pd

# Create dataframe from textblock data (one row per textblock)
df_tb = pd.DataFrame(tb_data, columns=['tb_id', 'title', 'content_html', 'content_text'])

# Write dataframe to Excel file
# os.path.join adds the path separator only when 'output_dir' does not
# already end with one
result_file = os.path.join(config['output_dir'], 'tb_data.xlsx')

print(f'Writing {result_file}')

# index=False leaves the dataframe's row index out of the sheet
df_tb.to_excel(result_file, sheet_name='textblocks', index=False)

# Output: Writing ./tb_data.xlsx

# Extract textblocks from Confluence cloud using a Python API

# Get textblock data

# Keep your Confluence configuration secret

# Write textblocks to TypeScript dictionary definition

# Write textblocks to Excel file