Implement smart generation so we don't re-generate existing HTML - squeeze - A static site generator that can put the toothpaste back in the tube.

commit 5f7e728f828d66710fc338f20b6aba2c3a006246
parent 8bd01a7aa0395916aef93b0e7a2fb5b5728c4df2
Author: St John Karp <contact@stjo.hn>
Date:   Fri, 10 Sep 2021 11:42:49 -0400

Implement smart generation so we don't re-generate existing HTML

Reimplemented an old feature that I got rid of but that deserves to make
a more sophisticated comeback. Only generate HTML files for Markdown
sources that are newer than the last generated RSS feed. This entails
a couple of additional steps:

- We must delete any HTML files for which a source no longer exists.

- We must exclude HTML files from the rsync command so they don't get deleted.

- We should include a -f/--force flag to allow the user to specify
  that they would like to force the generation of all the HTML files.

This last point is a "should" because you can always just delete
the contents of your output folder if you want to regenerate everything,
but then you'd be needlessly copying over all your assets again. Better
to make this part of Squeeze's functionality so it can be done intelligently.

Diffstat:
M README  | 3 +++
M squeeze.sh  | 80 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----------------------
M unsqueeze.sh  | 18 +++++++++---------

3 files changed, 69 insertions(+), 32 deletions(-)
diff --git a/README b/README
@@ -64,6 +64,9 @@ this repo.
 
 ## Use
 
+	squeeze.sh [-f|--force] site_path
+	unsqueeze.sh site_path
+
 Generate a static website from Markdown sources:
 
 	./squeeze.sh /home/user/website
diff --git a/squeeze.sh b/squeeze.sh
@@ -2,49 +2,83 @@
 
 # Generate a static website.
 
-# Usage: squeeze.sh SITE_PATH
+# Usage: squeeze.sh [-f|--force] site_path
 
-SITE_PATH=$1
+force=0
 
-OUTPUT_PATH="$SITE_PATH/output"
-SOURCE_PATH="$SITE_PATH/source"
+# Loop through all the arguments and set flags/options.
+while [ "$#" -gt 0 ] ; do
+	case "$1" in
+		-f|--force)
+			force=1
+			shift
+			;;
+		*)
+			site_path="$1"
+			shift
+			;;
+	esac
+done
+
+output_path="$site_path/output"
+source_path="$site_path/source"
+feed_path="$output_path/feeds/rss.xml"
 
 # A space-separated list of all the process IDs we've started.
 proc_ids=""
 # Max number of processes to run at once.
 # There is no way to do `nproc` with only POSIX tools,
 # so the best way to make this portable is with fallbacks.
-MAX_PROCESSES="$(nproc 2>/dev/null ||
+max_processes="$(nproc 2>/dev/null ||
 	sysctl -n hw.ncpu 2>/dev/null ||
 	getconf _NPROCESSORS_ONLN 2>/dev/null)"
 
+# Regenerate everything if the force flag has been used or there is
+# no RSS file, but otherwise only regenerate Markdown files that have
+# changed since the RSS feed was updated.
+rsync_exclude=
+find_test=
+[ "$force" -eq 0 ] &&
+	[ -f "$feed_path" ] &&
+	# Don't delete already generated HTML files.
+	rsync_exclude="--exclude *.html" &&
+	# Only find Markdown files newer than the RSS feed.
+	find_test="-newer $feed_path" &&
+	# Find and delete any HTML files for which a source Markdown
+	# no longer exists.
+	find "$output_path" -type f -name "*.html" |
+		sed "s|$output_path/||" |
+		while IFS= read -r file ; do
+			[ ! -f "$source_path/${file%.html}.md" ] &&
+				echo "deleting orphaned $file" &&
+				rm "$output_path/$file"
+		done
+
 # Copy everything that's not Markdown.
 # This will also create the folder structure for the destination Markdown files.
 rsync --archive --delete --verbose \
-	--exclude "*.md" --exclude "feeds" \
-	"$SOURCE_PATH/" "$OUTPUT_PATH/"
+	--exclude "*.md" --exclude "feeds" $rsync_exclude \
+	"$source_path/" "$output_path/"
 
 # Parse and create all the HTML files.
-find "$SOURCE_PATH" -type f -name "*.md" |
-	sed "s|$SITE_PATH/source/||g" |
+find "$source_path" -type f -name "*.md" $find_test |
+	sed "s|$source_path/||" |
 	while IFS= read -r file ; do
 		echo "$file"
 
 		# Determine if this file has any metadata at the start.
 		# Metadata are in the format Key: value, so it's easy to detect.
-		if head -n 1 "$SOURCE_PATH/$file" | grep -q "^[A-Za-z]*: " ; then
-			HEADERS=1
-		else
-	       		HEADERS=0
-		fi
+		head -n 1 "$source_path/$file" | grep -q "^[A-Za-z]*: " &&
+			headers=1 ||
+	       		headers=0
 
 		# Get everything after the metadata.
-		([ "$HEADERS" -eq 1 ] && sed '1,/^$/d' || cat) < "$SOURCE_PATH/$file" |
+		([ "$headers" -eq 1 ] && sed '1,/^$/d' || cat) < "$source_path/$file" |
 			# Convert Markdown to HTML.
 			markdown_py --extension footnotes --extension md_in_html --extension smarty --quiet --output_format xhtml |
 			# Recombine with the metadata and hand it to Prolog.
-			([ "$HEADERS" -eq 1 ] && sed '/^$/q' "$SOURCE_PATH/$file" ; cat) |
-			swipl --traditional --quiet -l parse_entry.pl -g "consult('$SITE_PATH/site.pl'), generate_entry." |
+			([ "$headers" -eq 1 ] && sed '/^$/q' "$source_path/$file" ; cat) |
+			swipl --traditional --quiet -l parse_entry.pl -g "consult('$site_path/site.pl'), generate_entry." |
 			# Unwrap block-level elements that have erroneously been wrapped in <p> tags.
 			sed 's|<p><details|<details|g' |
 			sed 's|</summary></p>|</summary>|g' |
@@ -53,7 +87,7 @@ find "$SOURCE_PATH" -type f -name "*.md" |
 			sed 's|</figure></p>|</figure>|g' |
 			# Smarten punctuation.
 			smartypants \
-			> "$OUTPUT_PATH/${file%%.md}.html" &
+			> "$output_path/${file%.md}.html" &
 
 		# Add the most recent process ID to the list.
 		proc_ids="$! $proc_ids"
@@ -61,7 +95,7 @@ find "$SOURCE_PATH" -type f -name "*.md" |
 		# or equal to the max processes. We have to subtract one
 		# because the `ps` command always outputs a header that we
 		# don't want to count.
-		while [ "$(expr "$(ps -p "${proc_ids%% }" | wc -l)" - 1)" -ge "$MAX_PROCESSES" ] ; do
+		while [ "$(expr "$(ps -p "${proc_ids%% }" | wc -l)" - 1)" -ge "$max_processes" ] ; do
 			true
 		done
 	done
@@ -70,9 +104,9 @@ find "$SOURCE_PATH" -type f -name "*.md" |
 wait
 
 # Generate the RSS feed.
-mkdir -p "$OUTPUT_PATH/feeds"
+mkdir -p "${feed_path%/*}"
 # Grep the date of each article.
-find "$OUTPUT_PATH" -type f -name "*.html" \
+find "$output_path" -type f -name "*.html" \
 	-exec grep "id=\"article-date\"" {} + |
 	# Sort articles by date (skipping the first field).
 	sort -k 2 |
@@ -81,5 +115,5 @@ find "$OUTPUT_PATH" -type f -name "*.html" \
 	# Reformat to just the file names.
 	cut -f 1 -d : |
 	# Parse the articles and generate the RSS.
-	swipl --traditional --quiet -l generate_rss.pl -g "consult('$SITE_PATH/site.pl'), generate_rss(\"$(date '+%a, %d %b %Y %T %Z')\")." \
-	> "$OUTPUT_PATH/feeds/rss.xml"
+	swipl --traditional --quiet -l generate_rss.pl -g "consult('$site_path/site.pl'), generate_rss(\"$(date '+%a, %d %b %Y %T %Z')\")." \
+	> "$feed_path"
diff --git a/unsqueeze.sh b/unsqueeze.sh
@@ -2,27 +2,27 @@
 
 # Ungenerate a static website.
 
-# Usage: unsqueeze.sh SITE_PATH
+# Usage: unsqueeze.sh site_path
 
-export SITE_PATH=$1
+export site_path=$1
 
-export OUTPUT_PATH="$SITE_PATH/output"
-export SOURCE_PATH="$SITE_PATH/source"
+export output_path="$site_path/output"
+export source_path="$site_path/source"
 
 # Copy everything that's not HTML.
 # Excludes the RSS folder, which we create ourselves upon generation.
 # This will also create the folder structure for the destination Markdown files.
 rsync --archive --delete --verbose \
        --exclude "*.html" --exclude "feeds" \
-       "$OUTPUT_PATH/" "$SOURCE_PATH/"
+       "$output_path/" "$source_path/"
 
 # Parse and create all the Markdown files.
-find "$OUTPUT_PATH" -type f -name "*.html" |
-	sed "s|$SITE_PATH/output/||g" |
+find "$output_path" -type f -name "*.html" |
+	sed "s|$output_path/||" |
 	while IFS= read -r file ; do
 		echo "$file"
 	
-		swipl --traditional --quiet -l parse_entry.pl -g "consult('$SITE_PATH/site.pl'), parse_entry('$SITE_PATH/output/$file')." |
+		swipl --traditional --quiet -l parse_entry.pl -g "consult('$site_path/site.pl'), parse_entry('$output_path/$file')." |
 			# Unsmarten the punctuation.
 			sed 's/&nbsp;/ /g' |
 			# Replace single quotes.
@@ -37,7 +37,7 @@ find "$OUTPUT_PATH" -type f -name "*.html" |
 			sed 's/&rdquo;/"/g' |
 			sed 's/&ldquo;/"/g' |
 			sed 's/&quot;/"/g' \
-			> "$SITE_PATH/source/${file%%.html}.md" &
+			> "$source_path/${file%.html}.md" &
 	done
 
 # Wait until all jobs have completed.

	squeeze A static site generator that can put the toothpaste back in the tube.
	git clone https://git.stjo.hn/squeeze
	Log \| Files \| Refs \| README \| LICENSE

M	README	\|	3	+++
M	squeeze.sh	\|	80	++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----------------------
M	unsqueeze.sh	\|	18	+++++++++---------