diff --git a/content/posts/automatic-migration-of-ghost-content/featured.png b/content/posts/automatic-migration-of-ghost-content/featured.png new file mode 100644 index 0000000..c627df8 --- /dev/null +++ b/content/posts/automatic-migration-of-ghost-content/featured.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:53603ffce965c98e0304c573b23425a200fb5dab98cd24d8d93fbee30c4d6f66 +size 479390 diff --git a/content/posts/automatic-migration-of-ghost-content/index.md b/content/posts/automatic-migration-of-ghost-content/index.md new file mode 100644 index 0000000..7528b96 --- /dev/null +++ b/content/posts/automatic-migration-of-ghost-content/index.md @@ -0,0 +1,286 @@ +--- +title: "Automatic migration of Ghost content" +date: 2024-03-10 +slug: "automatic-migration-of-ghost-content" +tags: ["hugo", "python"] +type: "programming" +series: ["Migration"] +series_order: 2 +--- + +In this second part of the migration of my Ghost blog to hugo, I'm going to show you how I automatically migrated my 38 CTF writeups. + +I started by exporting my Ghost blog to JSON. I then studied the JSON structure to find out where the posts were located. I determined the following structure: +```json +"db": [ + { + "data": { + [...] + "posts": [ + { + "id": "620c06d22ddec50001a0d647", + "uuid": "30679c97-4b80-41ac-aae0-91ea74ee85ca", + "title": "Writeup - Pandora (HTB)", + "slug": "writeup-pandora-htb", + "mobiledoc": "...", + "html": "...", + "comment_id": "620c06d22ddec50001a0d647", + "plaintext": "...", + "feature_image": "__GHOST_URL__/content/images/2022/03/Pandora.png", + "featured": 0, + "type": "post", + "status": "published", + [...] + }, + [...] + ] + [...] 
+ } + } +] +``` + +In order to iterate over the different articles, all I need to do is write the following loop: +```python +import json + +with open("export.json") as file: + export = json.load(file) + +for i in export["db"][0]["data"]["posts"]: + if "writeup" in i["slug"] and i["status"] == "published": + # [...] + +``` +Note that I added a condition to keep only articles containing `writeup` in the slug and with a `published` status. + +Now that I have the list of different items, I can convert them for Hugo. + +## Directory structure creation +The first step in converting articles is to create the directory structure. Hugo uses the following folder structure for articles: +``` +folder/ + article_1/ + img/ + featured.png + index.md + article_2/ + [...] + [...] +``` + +In the for loop created earlier, I use the following line to create the folder for each item: +```python +import os + +for i in export["db"][0]["data"]["posts"]: + [...] + os.makedirs(os.path.join("output/", i["slug"], "img"), exist_ok=True) + [...] +``` + +## Article conversion +I'm now going to create and fill in the `index.md`. To do this, I'll use the `html` field in the Ghost export and a Python library that converts HTML to Markdown: +{{< github repo="matthewwithanm/python-markdownify" >}} + +Still in the same for loop, I open the `index.md` file in the folder corresponding to the article. I write the header and then use the `md()` function to convert HTML to Markdown: +```python +from markdownify import MarkdownConverter + +for i in export["db"][0]["data"]["posts"]: + [...] + header = f"""--- + title: "{i["title"]}" + date: {i["published_at"].split("T")[0]} + slug: "{i["slug"]}" + type: "writeup-ctf" + --- \n\n""" + + with open(os.path.join("output/", i["slug"], "index.md"), "w") as index: + index.write(header) + content = md( + i["html"], heading_style="ATX", code_language="bash", bullets="-" + ) + index.write(content) + [...] 
+``` + +The markdownify library has a large number of options, and I've used the following: +- `heading_style` - Defines how headings should be converted. +- `code_language` - Defines the language that should be assumed for all `<pre>` sections +- `bullets` - An iterable (string, list, or tuple) of bullet styles to be used. + +## Download images +To complete the import, one important element is missing: the images. Since I'll be downloading files from several places in the code, I've created a function that I'll be able to reuse: +```python +from pathlib import Path +import requests + +def download_file(url: str, dst: Path) -> None: + data = requests.get(url).content + with open(dst, "wb") as img: + img.write(data) +``` +This function takes a URL and a destination for the final file. + +### Featured image +To download the featured image, I first create a variable containing either `None` if there is no feature_image for this item, or the image address. Since the export contains `__GHOST_URL__` instead of my blog link, I use the `replace()` function to make the change. If there's a feature image, I use the previously created `download_file()` function to download it: +```python +import os + +for i in export["db"][0]["data"]["posts"]: + [...] + feature_link = ( + i["feature_image"].replace("__GHOST_URL__", "https://blog.d3vyce.fr") + if i["feature_image"] + else None + ) + + if feature_link: + download_file( + feature_link, + os.path.join("output/", i["slug"], "featured.png"), + ) + [...] +``` + +### Article image +For images in articles, it's a little more complex. I have to find the images, download them and rename them with an incrementing number. Finally, I can integrate the image tags into the final Markdown. + +To do so, I use a custom class which is called in the `md()` function. As with the featured image, this class lets me replace the `__GHOST_URL__` placeholder, then download the images into a temporary folder. In the `download_img()` function, I assign a name to the image according to the files already in the folder, so that the names increment (image-1.png, image-2.png, ...). 
+ +Finally, after processing all the images in the article, I move all the files in the temporary folder to the final folder with `shutil.move()`. + +```python +import os +import shutil +import re +from markdownify import MarkdownConverter + +def download_img(url: str) -> Path: + files = os.listdir("tmp/") + id = [int(re.search(r"\-(.*?)\.", i).group(1)) for i in files] + file_name = "image-" + str(max(id) + 1) + ".png" if files else "image-1.png" + download_file(url, os.path.join("tmp/", file_name)) + return os.path.join("img/", file_name.replace(".png", ".webp")) + + +class ImageBlockConverter(MarkdownConverter): + def convert_img(self, el, text, convert_as_inline): + data = ( + super() + .convert_img(el, text, convert_as_inline) + .replace("__GHOST_URL__", GHOST_URL) + + "\n\n" + ) + if GHOST_URL in data: + img_url = re.search(r"\((.*?)\)", data).group(1) + return data.replace(img_url, download_img(img_url)) + + +def md(html, **options): + return ImageBlockConverter(**options).convert(html) + +for i in export["db"][0]["data"]["posts"]: + [...] + for file in os.listdir("tmp/"): + shutil.move( + os.path.join("tmp/", file), + os.path.join("output/", i["slug"], "img", file), + ) + [...] +``` + +## Conclusion +Thanks to this script, 90% of the work was done automatically, although I still went back over the various articles to correct two or three errors and check that nothing was missing. For those interested, here's the full code. It's highly customized and requires a few modifications if you want to use it. 
+ +### Final code + +```python +import json +import os +import shutil +import re +from pathlib import Path + +import requests +from markdownify import MarkdownConverter + +GHOST_URL = "https://blog.d3vyce.fr" +EXPORT_FILE = "export.json" +OUTPUT_DIR = "output/" +TMP_DIR = "tmp/" + + +def download_file(url: str, dst: Path) -> None: + data = requests.get(url).content + with open(dst, "wb") as img: + img.write(data) + + +def download_img(url: str) -> Path: + files = os.listdir(TMP_DIR) + id = [int(re.search(r"\-(.*?)\.", i).group(1)) for i in files] + file_name = "image-" + str(max(id) + 1) + ".png" if files else "image-1.png" + download_file(url, os.path.join(TMP_DIR, file_name)) + return os.path.join("img/", file_name.replace(".png", ".webp")) + + +class ImageBlockConverter(MarkdownConverter): + def convert_img(self, el, text, convert_as_inline): + data = ( + super() + .convert_img(el, text, convert_as_inline) + .replace("__GHOST_URL__", GHOST_URL) + + "\n\n" + ) + if GHOST_URL in data: + img_url = re.search(r"\((.*?)\)", data).group(1) + return data.replace(img_url, download_img(img_url)) + + +def md(html, **options): + return ImageBlockConverter(**options).convert(html) + + +def main(): + with open(EXPORT_FILE) as file: + export = json.load(file) + + for i in export["db"][0]["data"]["posts"]: + if "writeup" in i["slug"] and i["status"] == "published": + feature_link = ( + i["feature_image"].replace("__GHOST_URL__", GHOST_URL) + if i["feature_image"] + else None + ) + + header = f"""--- + title: "{i["title"]}" + date: {i["published_at"].split("T")[0]} + slug: "{i["slug"]}" + type: "writeup-ctf" + --- \n\n""" + + # Create directories + os.makedirs(os.path.join(OUTPUT_DIR, i["slug"], "img"), exist_ok=True) + os.makedirs(TMP_DIR, exist_ok=True) + # Create index.md + with open(os.path.join(OUTPUT_DIR, i["slug"], "index.md"), "w") as index: + index.write(header) + content = md( + i["html"], heading_style="ATX", code_language="bash", bullets="-" + ) + index.write(content) 
+ for file in os.listdir(TMP_DIR): + shutil.move( + os.path.join(TMP_DIR, file), + os.path.join(OUTPUT_DIR, i["slug"], "img", file), + ) + # Download featured img + if feature_link: + download_file( + feature_link, os.path.join(OUTPUT_DIR, i["slug"], "featured.png") + ) + +``` diff --git a/content/posts/migrate-from-ghost-to-hugo/index.md b/content/posts/migrate-from-ghost-to-hugo/index.md index e29d416..d05158e 100644 --- a/content/posts/migrate-from-ghost-to-hugo/index.md +++ b/content/posts/migrate-from-ghost-to-hugo/index.md @@ -4,6 +4,8 @@ date: 2024-02-24 slug: "migrate-from-ghost-to-hugo" tags: ["CI/CD", "docker", "git", "hugo"] type: "programming" +series: ["Migration"] +series_order: 1 --- ## Current solution