This script exports textblocks from Confluence to provide them as building blocks for online application forms.
Textblocks are included in application forms via a macro that takes the exported textblocks as input.
This script assumes that its configuration is specified in a config.json file with the following content:
{
"domain": "your_domain",
"username": "your_username",
"api_token": "your_api_token",
"output_dir": "your_output_directory",
"li_symbol": "your_symbol_for_list_elements"
}
This way you avoid having to specify your confidential Confluence configuration in the script itself.
Please make sure that your config.json is kept secret, e.g. by adding it to your .gitignore:
# Inside .gitignore
config.json
Apart from the Confluence login credentials, config.json may also contain other configuration parameters, such as the output directory (output_dir) and the symbol used for plain-text list elements (li_symbol).
# Read config parameters
import json
with open('config.json', 'r') as f:
    config = json.load(f)
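To fail early on an incomplete configuration, you can check for the required keys right after loading them; a minimal sketch (the key list mirrors the config.json example above):
# Optional: fail early if a required config key is missing
required_keys = ['domain', 'username', 'api_token', 'output_dir', 'li_symbol']
missing = [k for k in required_keys if k not in config]
if missing:
    raise KeyError(f'config.json is missing: {missing}')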
# Connect to Confluence cloud
from atlassian import Confluence
confluence = Confluence(
    url = config['domain'],
    username = config['username'],
    password = config['api_token'],
    cloud = True)
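If you want to verify the credentials before doing any real work, a cheap API call wrapped in try/except will do; a minimal sketch, assuming get_all_spaces is available in your version of atlassian-python-api:
# Optional: verify the Confluence connection early
try:
    confluence.get_all_spaces(start=0, limit=1)
except Exception as e:
    raise SystemExit(f'Could not connect to Confluence: {e}')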
# Read textblock content from Confluence
import bs4
from bs4 import BeautifulSoup
# Optional: improve readability of formatted html by customizing indentation (default indent = 1)
formatter = bs4.formatter.HTMLFormatter(indent = 4)
# Html elements that trigger a newline when converted to plain text
elem_newline = ['p', 'li', 'h1', 'h2', 'h3', 'h4', 'h5']
# For plain text: define prefix used for list elements
li_prefix = ""
if config['li_symbol'] != "":
    li_prefix = config['li_symbol'] + " "
# Extract textblock titles and contents
tb_data = []
tb_pages = confluence.get_all_pages_by_label('textblock', start=0, limit=50)
print(f'Extracting {len(tb_pages)} textblocks from Confluence')
for p in tb_pages:
    tb_id = ''
    title = ''
    content = ''
    content_text = ''
    old_string = ''
    page = confluence.get_page_by_id(p['id'], expand = 'body.storage', status = None, version = None)
    # Process page title
    title = page['title']
    # Split title at first occurrence of space
    tb_id, title = title.split(' ', 1)
    # Get textblock content
    page_body = page['body']['storage']['value']
    soup = BeautifulSoup(page_body, 'html.parser')
    # The textblock content should be in the last ac:structured-macro of a page (index -1)
    content = soup('ac:structured-macro')[-1]('ac:rich-text-body')[0]
    # Prepare textblock content for output
    # Replace 'ac:rich-text-body' tag name with 'div'
    content.name = 'div'
    # Add attributes to div tag
    content['tb_id'] = tb_id
    content['title'] = title
    # Convert content to formatted html
    content_html = str(content)
    # Optional: prettify html content
    #content_html = content.prettify(formatter = formatter)
    # Convert content to plain text
    # The last element has to be treated differently: no trailing newline
    elem_max = len(list(content.descendants)) - 1
    elem_counter = 0
    # Iterate recursively over content elements
    for elem in content.descendants:
        elem_counter += 1
        # Suppress duplicate strings
        if elem.string and elem.string == old_string:
            old_string = elem.string
            continue  # proceed to next iteration
        # Process lists
        if elem.name in ['ul', 'ol']:
            for li in elem.children:
                content_text += li_prefix
                # Add (recursive) content of li element
                for li_string in li.strings:
                    content_text += li_string
                if elem_counter < elem_max:
                    content_text += "\n"
        # Process everything else
        if (elem.find_parent('ul') is None and
                elem.find_parent('ol') is None):
            # elem is not part of a list
            if elem.string:
                content_text += elem.string
            if elem.name in elem_newline and elem_counter < elem_max:
                content_text += "\n"
        old_string = elem.string
    tb_data.append([tb_id, title, content_html, content_text])
Extracting 7 textblocks from Confluence
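Note that get_all_pages_by_label returns at most limit results per call, so the call above will miss textblocks beyond the first 50. If you expect more, you can page through the results; a minimal sketch:
# Fetch all pages labelled 'textblock', one batch of 50 at a time
tb_pages = []
start = 0
while True:
    batch = confluence.get_all_pages_by_label('textblock', start=start, limit=50)
    if not batch:
        break
    tb_pages.extend(batch)
    start += len(batch)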
# Create ts file from textblock data
# Comment line
comment_line = "// GENERATED CODE: DO NOT CHANGE MANUALLY!\n\n"
# Beginning of ts dictionary definition
type_def = "export const texts: { [key: string]: string } =\n"
# Create a dictionary with key = tb_id and value = content_html via dict comprehension
dict_tb_html = {tb[0]: tb[2] for tb in tb_data}
# Create a dictionary with key = tb_id extended with "_plain" and value = content_text via dict comprehension
dict_tb_plain = {tb[0] + '_plain': tb[3] for tb in tb_data}
# Merge the two dictionaries
dict_tb = {**dict_tb_html, **dict_tb_plain}
# Serialize dictionary content
dict_content = json.dumps(
    dict_tb,
    sort_keys = True,
    indent = 0,
    separators = (',\n', ':\n'))
# Build complete content of type definition file
type_def = comment_line + type_def + dict_content
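The indent and separators arguments are chosen so that each key and each value lands on a line of its own, which is exactly what the line-based post-processing below relies on; json.dumps also escapes the newlines inside the plain text values as \n. A quick illustration with hypothetical data:
# Illustration only (hypothetical entries): every key and value gets its own line,
# and the newline inside the plain text value is escaped as \n
print(json.dumps(
    {'tb1': '<div>Hello</div>', 'tb1_plain': 'Line 1\nLine 2'},
    sort_keys = True,
    indent = 0,
    separators = (',\n', ':\n')))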
import re
# Convert plain text strings into multiline string
transformed_def = []
counter = 0
plain_text_counter = 0
type_def_lines = type_def.split("\n")
for line in type_def_lines:
    plain_lines = []
    counter += 1
    if "_plain" in line:
        # The current line contains the plain text key,
        # so the next line will contain the plain text string
        plain_text_counter = counter + 1
    if plain_text_counter == counter:
        # Convert plain text string into multiline string
        plain_lines = line.split("\\n")
        p_lines = []
        for p_line in plain_lines:
            # Replace leading and trailing quote with backtick
            p_line = re.sub('^"|"$', '`', p_line)
            # Replace quote before final comma with backtick
            p_line = re.sub('",$', '`,', p_line)
            p_lines.append(p_line)
        # Concatenate the plain text lines with newline as separator
        line = '\n'.join(p_lines)
    transformed_def.append(line)
# Concatenate all type definition lines with newline as separator
type_def = "\n".join(transformed_def)
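Applied to a single hypothetical value line of a *_plain entry, the two substitutions above turn the escaped JSON string into a TypeScript template literal:
# Illustration only (hypothetical value line of a *_plain entry)
demo_line = '"Line 1\\nLine 2",'
demo_parts = [re.sub('",$', '`,', re.sub('^"|"$', '`', part))
              for part in demo_line.split('\\n')]
print('\n'.join(demo_parts))
# Output:
# `Line 1
# Line 2`,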
# Write ts file
ts_file_name = config['output_dir'] + 'texts.ts'
print(f'Writing {ts_file_name}')
with open(ts_file_name, 'w') as ts_file:
    ts_file.write(type_def)
Writing ./texts.ts
# Write textblock data to Excel file
import pandas as pd
# Create dataframe from textblock data
df_tb = pd.DataFrame(tb_data, columns=['tb_id', 'title', 'content_html', 'content_text'])
# Write dataframe to Excel file
result_file = config['output_dir'] + 'tb_data.xlsx'
print(f'Writing {result_file}')
df_tb.to_excel(result_file, sheet_name = 'textblocks', index = False)
Writing ./tb_data.xlsx