#!/bin/bash
# vim: set noexpandtab:

# Copyright 2014-2019 Holger Levsen <holger@layer-acht.org>
#         © 2015-2018 Mattia Rizzolo <mattia@debian.org>
# released under the GPLv=2

DEBUG=false
# shared helpers; common_init is defined there and gets all our arguments
. /srv/jenkins/bin/common-functions.sh
common_init "$@"

# common code defining db access
. /srv/jenkins/bin/reproducible_common.sh

# DIRTY is set to true by every check below which found something to report
DIRTY=false
REP_RESULTS=/srv/reproducible-results

# query reproducible database, print output
#
# Arguments: passed verbatim to "psql -c" (the first one is the SQL command)
# Outputs:   what psql printed on stdout, with trailing newlines stripped
query_to_print() {
	# use a constant format string: the query result is data and may contain
	# '%' or '\' which printf must print literally instead of interpreting
	printf '%s' "$(psql -c "$@")"
}

# backup db
if [ "$HOSTNAME" = "$MAINNODE" ] ; then
	echo "$(date -u) - backup db and update public copy."
	# prepare backup
	mkdir -p "$REP_RESULTS/backup"

	# keep 30 days and the 1st of the month
	# (DAY has to be a command substitution: a plain "DAY=(date ...)" creates
	# an array, "$DAY" would then never be "01" and no backup would be kept)
	DAY=$(date -d "30 day ago" '+%d')
	DATE=$(date -d "30 day ago" '+%Y-%m-%d')
	BACKUPFILE="$REP_RESULTS/backup/reproducible_$DATE.sql.xz"
	if [ "$DAY" != "01" ] && [ -f "$BACKUPFILE" ] ; then
		rm -f "$BACKUPFILE"
	fi

	# Make a daily backup of database
	DATE=$(date '+%Y-%m-%d')
	BACKUPFILE="$REP_RESULTS/backup/reproducible_$DATE.sql"
	# only once per day: skip if today's compressed dump already exists
	if [ ! -f "$BACKUPFILE.xz" ] ; then
		# make the backup
		pg_dump -x -O "$PGDATABASE" > "$BACKUPFILE"
		xz "$BACKUPFILE"

		# make the backup public
		ln -s -f "$BACKUPFILE.xz" "$BASE/reproducible.sql.xz"

		# recreate documentation of database
		postgresql_autodoc -d "$PGDATABASE" -t html -f "$BASE/reproducibledb"
	fi
fi

#
# we fail hard
#
set -e

#
# find too large files in /var/log
#
echo "$(date -u) - Looking for too large files in /var/log/"
# files above 8G are only reported...
TOOBIG=$(find /var/log -size +8G -exec ls -lah {} \; 2>/dev/null || true)
if [ ! -z "$TOOBIG" ] ; then
	echo
	echo "$(date -u) - Warning: too large files found in /var/log:"
	echo "$TOOBIG"
	echo
	DIRTY=true
	# ...while anything above 32G makes this job fail
	# ('Permission denied' noise from find is filtered out)
	if [ -n "$(find /var/log -size +32G 2> >(grep -v 'Permission denied'))" ] ; then
		echo "$(date -u) - Error, more than 32gb is just wrong..."
		exit 1
	fi
fi

#
# delete old temp directories
#
echo "$(date -u) - Deleting temp directories in $REP_RESULTS/rbuild-debian, older than 3 days."
# first only list the candidates, so that we know whether there is something to do
OLDSTUFF=$(find "$REP_RESULTS/rbuild-debian" -maxdepth 1 -type d -mtime +2 -name "tmp.*" -exec ls -lad {} \; 2>/dev/null|| true)
if [ ! -z "$OLDSTUFF" ] ; then
	echo
	echo "Old temp directories found in $REP_RESULTS/rbuild-debian"
	# same find expression as above, this time removing (verbosely)
	find "$REP_RESULTS/rbuild-debian" -maxdepth 1 -type d -mtime +2 -name "tmp.*" -exec rm -rv --one-file-system {} \; || true
	echo "These old directories have been deleted."
	echo
	DIRTY=true
fi

#
# delete old temp directories in /tmp (probably only useful on osuosl171+172)
#
echo "$(date -u) - Deleting temporary directories in /tmp, older than 3 days."
# first only list the candidates, so that we know whether there is something to do
OLDSTUFF=$(find /tmp -maxdepth 1 -type d -mtime +2 -regextype egrep -regex '/tmp/(tmp.*|Test.*|usession-release.*|.*test.*)' -exec ls -lad {} \; || true)
if [ ! -z "$OLDSTUFF" ] ; then
	echo
	echo "Old temp directories found in /tmp"
	# same find expression as above, this time removing via sudo
	find /tmp -maxdepth 1 -type d -mtime +2 -regextype egrep -regex '/tmp/(tmp.*|Test.*|usession-release.*|.*test.*)' -exec sudo rm -rv --one-file-system {} \; || true
	echo "These old directories have been deleted."
	echo
	DIRTY=true
fi

#
# delete old pbuilder build directories
#
if [ -d /srv/workspace/pbuilder/ ] ; then
	echo "$(date -u) - Deleting pbuilder build directories, older than 3 days."
	# only directories with a purely numeric name are considered
	OLDSTUFF=$(find /srv/workspace/pbuilder/ -maxdepth 2 -regex '.*/[0-9]+' -type d -mtime +2 -exec ls -lad {} \; || true)
	if [ ! -z "$OLDSTUFF" ] ; then
		echo
		echo "Old pbuilder build directories found in /srv/workspace/pbuilder/"
		echo -n "$OLDSTUFF"
		# same find expression as above, this time removing via sudo
		( find /srv/workspace/pbuilder/ -maxdepth 2 -regex '.*/[0-9]+' -type d -mtime +2 -exec sudo rm -rf --one-file-system {} \; ) || true
		echo
		DIRTY=true
	fi
fi

#
# delete old chroot-installation directories (not related to reproducible builds)
#
if [ -d /srv/workspace/chroots/ ] ; then
	echo "$(date -u) - Deleting chroots build directories, older than 7 days."
	# first only list the candidates, so that we know whether there is something to do
	OLDSTUFF=$(find /srv/workspace/chroots/ -maxdepth 2 -name 'chroot-installation*' -type d -mtime +6 -exec ls -lad {} \; || true)
	if [ ! -z "$OLDSTUFF" ] ; then
		echo
		echo "Old chroot-installation directories found in /srv/workspace/chroots/"
		echo -n "$OLDSTUFF"
		# same find expression as above, this time removing via sudo
		( find /srv/workspace/chroots/ -maxdepth 2 -name 'chroot-installation*' -type d -mtime +6 -exec sudo rm -rf --one-file-system {} \; ) || true
		echo
		DIRTY=true
	fi
fi

#
# check for working proxy
#
echo "$(date -u) - testing whether the proxy works..."
# curl has to be tested directly in the condition: 'set -e' is in effect, so
# a failing curl run as a plain statement would abort the script right away
# and the error message below would never be printed
if ! curl http://www.debian.org > /dev/null ; then
	echo "Error: curl http://www.debian.org failed, probably the proxy is down for $HOSTNAME"
	exit 1
fi

if [ "$HOSTNAME" = "$MAINNODE" ] ; then
	#
	# find nodes with problems and temporarily turn them offline
	#
	echo "$(date -u) - Looking for unhealthy nodes."
	cd ~/jobs
	# reference file, its mtime is set to FORCE_DATE for the '-ot' test below
	DUMMY_FILE=$(mktemp --tmpdir=$TMPDIR maintenance-XXXXXXX)
	SICK=""
	for i in reproducible_node_health_check_* reproducible_maintenance_* ; do
		# node alias and architecture are encoded in the job name, the
		# thresholds depend on the type of job
		case $i in
			reproducible_node_health_check_amd64_jenkins|reproducible_maintenance_amd64_jenkins)
				echo "Skipping $i..."
				continue
				;;
			reproducible_node_health_check_*)
				NODE_ALIAS=$(echo $i | cut -d '_' -f6)
				NODE_ARCH=$(echo $i | cut -d '_' -f5)
				FORCE_DATE=$(date -u -d "2 hour ago" '+%Y-%m-%d %H:%M')
				MAXDIFF=8
				;;
			reproducible_maintenance_*)
				NODE_ALIAS=$(echo $i | cut -d '_' -f4)
				NODE_ARCH=$(echo $i | cut -d '_' -f3)
				FORCE_DATE=$(date -u -d "5 hour ago" '+%Y-%m-%d %H:%M')
				MAXDIFF=2
				;;
		esac
		touch -d "$FORCE_DATE" $DUMMY_FILE
		# turn alias + architecture into the full hostname of the node
		case $NODE_ARCH in
			amd64)
				case "$NODE_ALIAS" in
					(profitbricks*) NODE="profitbricks-build${NODE_ALIAS#profitbricks}-amd64.debian.net" ;;
					(osuosl*) NODE="osuosl-build${NODE_ALIAS#osuosl}-amd64.debian.net" ;;
				esac ;;
			i386)	NODE="profitbricks-build${NODE_ALIAS#profitbricks}-i386.debian.net" ;;
			arm64)	NODE="codethink-sled${NODE_ALIAS#codethink}-arm64.debian.net" ;;
			armhf)	NODE="${NODE_ALIAS}-armhf-rb.debian.net" ;;
		esac
		case "$NODE" in
			profitbricks-build9-amd64.debian.net|profitbricks-build10-amd64.debian.net)
				# pb9 and pb10 are not used for r-b and sometimes are too busy
				# to run healthcheck / maintenance jobs
				echo "Skipping ${NODE}..."
				continue
				;;
		esac
		cd $i/builds
		# LAST: most recently modified entry, GOOD: target of lastSuccessfulBuild
		LAST=$(ls -rt1 | tail -1)
		GOOD=$(basename $(readlink -f lastSuccessfulBuild))
		if [ "$LAST" = "$GOOD" ] ; then
			DIFF=0
		else
			# -1 flags that the difference could not be computed
			let DIFF=$LAST-$GOOD || DIFF=-1
		fi
		if [ $DIFF -eq -1 ] ; then
			echo "Problems analysing $i build logs, ignoring $NODE."
		# either the diff is greater than $MAXDIFF (=the last $MAXDIFF job runs failed)
		# or the latest build is older than $FORCE_DATE, which is 2h ago for health
		# checks and 5h ago for maintenance jobs (=a job is still running/hanging)
		elif [ $DIFF -gt $MAXDIFF ] || [ $LAST -ot $DUMMY_FILE ] ; then
			echo -n "$i job has issues since more than an hour"
			if grep -q $NODE ~/offline_nodes >/dev/null 2>&1 ; then
				echo " and $NODE already marked as offline, good."
			else
				echo $NODE >> ~/offline_nodes
				echo " so $NODE has (temporarily) been marked as offline now."
				SICK="$SICK $NODE"
			fi
		else
			echo "$NODE is doing fine, good."
		fi
		cd ../..
	done
	if [ -n "$SICK" ] ; then
		# shorten the hostnames and drop the leading space
		SICK=$(echo "$SICK" | sed 's#.debian.net##g' | sed 's#-rb##g' | sed 's# ##' )
		# a remaining space means more than one node: use plural
		if echo "$SICK" | grep -q ' ' 2>/dev/null ; then
			SICK=$(echo "$SICK" | sed 's# # and #g')
			MESSAGE="$SICK have health problems and have temporarily been marked as offline."
		else
			MESSAGE="$SICK has health problems and has temporarily been marked as offline."
		fi
		irc_message reproducible-builds "$MESSAGE To make this permanent, edit jenkins-home/offline_nodes in git."
	fi
	rm -f $DUMMY_FILE
fi

echo "$(date -u) - updating the chdists, schroots and pbuilder now..."
# use host architecture (only)
ARCH=$(dpkg --print-architecture)
# use host apt proxy configuration for pbuilder
if [ ! -z "$http_proxy" ] ; then
	pbuilder_http_proxy="--http-proxy $http_proxy"
fi
for s in $SUITES ; do
	if [ "${HOSTNAME:0:6}" = "osuosl" ] ; then
		# osuosl nodes are not used to do Debian rebuilds
		continue
	fi
	#
	# chdist update
	#
	distname="$s-$ARCH"
	echo "$(date -u) - updating the $s/$ARCH chdist now."
	if [ ! -d "$CHPATH/$distname" ]; then
		echo "$(date -u) - chdist not existing, creating one now..."
		if ! chdist --data-dir="$CHPATH" --arch="$ARCH" create "$distname" "$MIRROR" "$s" main ; then
			echo "Error: failed to create the $s/$ARCH chdist."
			exit 1
		fi
		. /srv/jenkins/bin/jenkins_node_definitions.sh
		get_node_information "$HOSTNAME"
		# NODE_RUN_IN_THE_FUTURE is executed as a command, so it is
		# expected to hold "true" or "false"
		if "$NODE_RUN_IN_THE_FUTURE" ; then
			echo "This node is reported to run in the future, configuring APT to ignore the Release file expiration..."
			echo 'Acquire::Check-Valid-Until "false";' > "$CHPATH/$distname/etc/apt/apt.conf.d/398future"
		fi
	fi
	if ! chdist --data-dir="$CHPATH" apt-get "$distname" update ; then
		echo "Warning: failed to update the $s/$ARCH chdist."
		DIRTY=true
	fi
	#
	# schroot update
	#
	#echo "$(date -u) - updating the $s/$ARCH schroot now."
	#for i in 1 2 3 4 ; do
	#	[ ! -d $SCHROOT_BASE/reproducible-$s ] || schroot --directory /root -u root -c source:jenkins-reproducible-$s -- apt-get update
	#	RESULT=$?
	#	if [ $RESULT -eq 1 ] ; then
	#		# sleep 61-120 secs
	#		echo "Sleeping some time... (to workaround network problems like 'Hash Sum mismatch'...)"
	#		/bin/sleep $(echo "scale=1 ; ($(shuf -i 1-600 -n 1)/10)+60" | bc )
	#		echo "$(date -u) - Retrying to update the $s/$ARCH schroot."
	#	elif [ $RESULT -eq 0 ] ; then
	#		break
	#	fi
	#done
	#if [ $RESULT -eq 1 ] ; then
	#	echo "Warning: failed to update the $s/$ARCH schroot."
	#	DIRTY=true
	#fi
	#
	# pbuilder update
	#
	# pbuilder aint used on jenkins anymore
	if [ "$HOSTNAME" = "$MAINNODE" ] ; then
		continue
	else
		echo "$(date -u) - updating pbuilder for $s/$ARCH now."
	fi
	# up to four attempts, with a random pause after each failed one
	for i in 1 2 3 4 ; do
		# $pbuilder_http_proxy is deliberately unquoted: it holds an option plus its value
		[ ! -f /var/cache/pbuilder/$s-reproducible-base.tgz ] || sudo pbuilder --update $pbuilder_http_proxy --basetgz /var/cache/pbuilder/$s-reproducible-base.tgz
		RESULT=$?
		if [ $RESULT -eq 1 ] ; then
			# sleep 61-120 secs
			echo "Sleeping some time... (to workaround network problems like 'Hash Sum mismatch'...)"
			/bin/sleep $(echo "scale=1 ; ($(shuf -i 1-600 -n 1)/10)+60" | bc )
			echo "$(date -u) - Retrying to update pbuilder for $s/$ARCH."
		elif [ $RESULT -eq 0 ] ; then
			break
		fi
	done
	if [ $RESULT -eq 1 ] ; then
		echo "Warning: failed to update pbuilder for $s/$ARCH."
		DIRTY=true
	fi
done
set -e

# for alpine
# (errexit is switched off here, the exit code of schroot is checked by hand)
set +e
case $HOSTNAME in
	osuosl-build169*|osuosl-build170*|jenkins)
		echo "$(date -u) - updating alpine schroot now."
		schroot --directory /tmp -c source:jenkins-reproducible-alpine -u root -- apk update
		RESULT=$?
		if [ $RESULT -eq 1 ] ; then
			echo "Warning: failed to update alpine schroot."
			DIRTY=true
		else
			echo "$(date -u) - updating alpine schroot done."
		fi
		;;
	*)	;;
esac
set -e

# for Arch Linux
# (errexit is switched off here, the exit code of schroot is checked by hand)
set +e
case $HOSTNAME in
	osuosl-build169*|osuosl-build170*|jenkins)
		echo "$(date -u) - updating Arch Linux schroot now."
		schroot --directory /tmp -c source:jenkins-reproducible-archlinux -u root -- pacman -Syu --noconfirm
		RESULT=$?
		if [ $RESULT -eq 1 ] ; then
			echo "Warning: failed to update Arch Linux schroot."
			echo "Let's see if /var/lib/pacman/db.lck exists in the schroot."
			schroot --directory /tmp -c source:jenkins-reproducible-archlinux -u root -- ls /var/lib/pacman/db.lck
			DIRTY=true
		else
			echo "$(date -u) - updating Arch Linux schroot done."
		fi
		;;
	*)	;;
esac
set -e

# delete build services logfiles
if [ "$HOSTNAME" = "$MAINNODE" ] ; then
	if [ -d /var/lib/jenkins/userContent/reproducible/debian/build_service/ ] ; then
		echo "$(date -u) - Deleting logfiles from build services directories, older than a day."
		OLDSTUFF=$(find /var/lib/jenkins/userContent/reproducible/debian/build_service/ -maxdepth 2 -regex '.*/[0-9]+' -type d -mtime +0 -exec ls -lad {} \; || true)
		if [ ! -z "$OLDSTUFF" ] ; then
			echo
			echo "Old logfiles cleaned in /var/lib/jenkins/userContent/reproducible/debian/build_service/"
			echo -n "$OLDSTUFF"
			# we make sure to actually only delete console.log.gz older than a day
			# other stuff we only delete after two days (in case a build is running more than 24h...)
			#
			# console.log.gz is a file inside of the numbered directories, thus
			# it is matched one level deeper and as a regular file (asking for a
			# directory named [0-9]+ and console.log.gz at once matches nothing)
			find /var/lib/jenkins/userContent/reproducible/debian/build_service/ -maxdepth 3 -regex '.*/[0-9]+/console\.log\.gz' -type f -mtime +0 -exec rm -f {} \; || true
			find /var/lib/jenkins/userContent/reproducible/debian/build_service/ -maxdepth 2 -regex '.*/[0-9]+' -type d -mtime +1 -exec rm -rf --one-file-system {} \; || true
			echo
		fi
	fi
fi

# remove too old schroot sessions
echo "$(date -u) - Removing schroot sessions older than 3 days."
dir=/var/lib/schroot/unpack/
OLDSTUFF=$(find "$dir" -mindepth 1 -maxdepth 1 -type d -mtime +2 -exec ls -lad {} \;)
if [ ! -z "$OLDSTUFF" ]; then
	echo
	echo "schroot sessions older than 3 days found, which will be deleted:"
	echo "$OLDSTUFF"
	echo
	# the directory name is used as the name of the session to end
	for s in $(find "$dir" -mindepth 1 -maxdepth 1 -type d -mtime +2 -print0 | xargs -0 -r basename -a); do
		echo "$(date -u) - removing schroot session $s..."
		schroot -c "$s" --end-session
	done
	# look again, whatever is still there could not be removed
	OLDSTUFF=$(find "$dir" -mindepth 1 -maxdepth 1 -type d -mtime +2 -exec ls -lad {} \;)
	if [ ! -z "$OLDSTUFF" ]; then
		echo
		echo "Warning: Tried, but failed to delete these:"
		echo "$OLDSTUFF"
		echo "Manual cleanup needed"
	fi
	echo
	DIRTY=true
fi

# find old schroots
echo "$(date -u) - Removing schroots older than 3 days."
regex="/schroots/(reproducible-.+-[0-9]{1,5}|schroot-install-.+)"
OLDSTUFF=$(find /schroots/ -maxdepth 1 -type d -regextype posix-extended -regex "$regex" -mtime +2 -exec ls -lad {} \; || true)
if [ ! -z "$OLDSTUFF" ] ; then
	echo
	echo "schroots older than 3 days found in /schroots, which will be deleted:"
	find /schroots/ -maxdepth 1 -type d -regextype posix-extended -regex "$regex" -mtime +2 -exec sudo rm -rf --one-file-system {} \; || true
	# this is the listing taken before the removal
	echo "$OLDSTUFF"
	# look again, whatever is still there could not be removed
	OLDSTUFF=$(find /schroots/ -maxdepth 1 -type d -regextype posix-extended -regex "$regex" -mtime +2 -exec ls -lad {} \; || true)
	if [ ! -z "$OLDSTUFF" ] ; then
		echo
		echo "Warning: Tried, but failed to delete these:"
		echo "$OLDSTUFF"
		echo "Manual cleanup needed!"
	fi
	echo
	DIRTY=true
fi

# find very old schroots
echo "$(date -u) - Detecting schroots older than 1 month"
# the reproducible-archlinux schroot is ignored because its ment to be long living
#
# '|| true' (and not '| true', which would throw away the whole listing) is
# needed because grep exits non-zero if there is nothing left to print, which
# is the good case and must not abort the script
OLDSTUFF=$(find /schroots/ -mindepth 1 -maxdepth 1 -mtime +30 -exec ls -lad {} \; | grep -v reproducible-archlinux || true)
if [ ! -z "$OLDSTUFF" ]; then
	echo
	echo "Warning: schroots older than 1 month found in /schroot:"
	echo "$OLDSTUFF"
	echo
	echo "Manual cleanup needed!"
	echo
	DIRTY=true
fi

if [ "$HOSTNAME" = "$MAINNODE" ] ; then
	#
	# find failed builds due to network problems and reschedule them
	#
	# only grep through the last 5h (300 minutes) of builds...
	# (ignore "*None.rbuild.log" because these are build which were just started)
	# this job runs every 4h
	echo "$(date -u) - Rescheduling failed builds due to network issues."
	FAILED_BUILDS=$(find $DEBIAN_BASE/rbuild -type f ! -name "*None.rbuild.log" ! -mmin +300 -exec zgrep -l -E 'E: Failed to fetch.*(Unable to connect to|Connection failed|Size mismatch|Cannot initiate the connection to|Bad Gateway|Service Unavailable)' {} \; || true)
	if [ ! -z "$FAILED_BUILDS" ] ; then
		echo
		echo "The following builds have failed due to network problems and will be rescheduled now:"
		echo "$FAILED_BUILDS"
		echo
		echo "Rescheduling packages: "
		REQUESTER="jenkins maintenance job"
		REASON="maintenance reschedule: reschedule builds which failed due to network errors"
		# suite, architecture and package name are taken from fixed positions
		# (fields 9, 10 and 11) of the path of the log file
		for SUITE in $(echo $FAILED_BUILDS | sed "s# #\n#g" | cut -d "/" -f9 | sort -u) ; do
			for ARCH in $(echo $FAILED_BUILDS | sed "s# #\n#g" | cut -d "/" -f10 | sort -u) ; do
				CANDIDATES=$(for PKG in $(echo $FAILED_BUILDS | sed "s# #\n#g" | grep "/$SUITE/$ARCH/" | cut -d "/" -f11 | cut -d "_" -f1) ; do echo "$PKG" ; done)
				# double check those builds actually failed
				TO_SCHEDULE=""
				for pkg in $CANDIDATES ; do
					QUERY="SELECT s.name FROM sources AS s JOIN results AS r ON r.package_id=s.id
						   WHERE s.suite='$SUITE' AND s.architecture='$ARCH' AND (r.status='FTBFS' OR r.status='depwait') AND s.name='$pkg'"
					TO_SCHEDULE=${TO_SCHEDULE:+"$TO_SCHEDULE "}$(query_db "$QUERY")
				done
				schedule_packages $TO_SCHEDULE
			done
		done
		DIRTY=true
	fi

	#
	# find failed builds due to diffoscope schroot problems and reschedule them
	#
	# only grep through the last 5h (300 minutes) of builds...
	# (ignore "*None.rbuild.log" because these are build which were just started)
	# this job runs every 4h
	echo "$(date -u) - Rescheduling failed builds due to diffoscope schroot issues."
	FAILED_BUILDS=$(find $DEBIAN_BASE/rbuild -type f ! -name "*None.rbuild.log" ! -mmin +300 -exec zgrep -l -F 'E: 10mount: error: Directory' {} \; || true)
	if [ ! -z "$FAILED_BUILDS" ] ; then
		echo
		echo "Warning: The following builds have failed due to diffoscope schroot problems and will be rescheduled now:"
		echo "$FAILED_BUILDS"
		echo
		echo "Rescheduling packages: "
		REQUESTER="jenkins maintenance job"
		REASON="maintenance reschedule: reschedule builds which failed due to diffoscope schroot errors"
		for SUITE in $(echo $FAILED_BUILDS | sed "s# #\n#g" | cut -d "/" -f9 | sort -u) ; do
			for ARCH in $(echo $FAILED_BUILDS | sed "s# #\n#g" | cut -d "/" -f10 | sort -u) ; do
				CANDIDATES=$(echo $FAILED_BUILDS | sed "s# #\n#g" | grep "/$SUITE/$ARCH/" | cut -d "/" -f11 | cut -d "_" -f1 | xargs)
				if [ ! -z "$CANDIDATES" ]; then
					schedule_packages $CANDIDATES
				fi
			done
		done
		DIRTY=true
	fi

	#
	# find packages which build didnt end correctly
	#
	echo "$(date -u) - Rescheduling builds which didn't end correctly."
	DATE=$(date '+%Y-%m-%d %H:%M' -d "-2 days")
	QUERY="
		SELECT s.id, s.name, p.date_scheduled, p.date_build_started
			FROM schedule AS p JOIN sources AS s ON p.package_id=s.id
			WHERE p.date_scheduled != ''
			AND p.date_build_started IS NOT NULL
			AND p.date_build_started < '$DATE'
			ORDER BY p.date_scheduled
		"
	PACKAGES=$(mktemp --tmpdir=$TEMPDIR maintenance-XXXXXXXXXXXX)
	query_db "$QUERY" > $PACKAGES 2> /dev/null || echo "Warning: SQL query '$QUERY' failed."
	# a '|' (the column separator) in the output means there is at least one row
	if grep -q '|' $PACKAGES ; then
		echo
		echo "Packages found where the build was started more than 48h ago:"
		query_to_print "$QUERY" 2> /dev/null || echo "Warning: SQL query '$QUERY' failed."
		echo
		# the first column is the id of the package
		for PKG in $(cat $PACKAGES | cut -d "|" -f1) ; do
			echo "query_db \"UPDATE schedule SET date_build_started = NULL, job = NULL WHERE package_id = '$PKG';\""
			query_db "UPDATE schedule SET date_build_started = NULL, job = NULL WHERE package_id = '$PKG';"
		done
		echo "Packages have been rescheduled."
		echo
		DIRTY=true
	fi
	rm $PACKAGES

	#
	# find packages which have been removed from the archive
	#
	echo "$(date -u) - Looking for packages which have been removed from the archive."
	PACKAGES=$(mktemp --tmpdir=$TEMPDIR maintenance-XXXXXXXXXX)
	QUERY="SELECT name, suite, architecture FROM removed_packages
			LIMIT 25"
	query_db "$QUERY" > $PACKAGES 2> /dev/null || echo "Warning: SQL query '$QUERY' failed."
	if grep -q '|' $PACKAGES ; then
		DIRTY=true
		echo
		echo "Found files relative to old packages, no more in the archive:"
		echo "Removing these removed packages from database:"
		query_to_print "$QUERY" 2> /dev/null || echo "Warning: SQL query '$QUERY' failed."
		echo
		# change directory exactly once, so that the 'cd -' after the loop
		# really returns to the directory we came from (when cd is run again
		# for every package, 'cd -' ends up in $DEBIAN_BASE itself)
		cd $DEBIAN_BASE
		for pkg in $(cat $PACKAGES) ; do
			PKGNAME=$(echo "$pkg" | cut -d '|' -f 1)
			SUITE=$(echo "$pkg" | cut -d '|' -f 2)
			ARCH=$(echo "$pkg" | cut -d '|' -f 3)
			QUERY="DELETE FROM removed_packages
				WHERE name='$PKGNAME' AND suite='$SUITE' AND architecture='$ARCH'"
			query_db "$QUERY"
			find rb-pkg/$SUITE/$ARCH rbuild/$SUITE/$ARCH dbd/$SUITE/$ARCH dbdtxt/$SUITE/$ARCH buildinfo/$SUITE/$ARCH logs/$SUITE/$ARCH logdiffs/$SUITE/$ARCH -name "${PKGNAME}_*" | xargs -r rm -v || echo "Warning: couldn't delete old files from ${PKGNAME} in $SUITE/$ARCH"
		done
		cd - > /dev/null
	fi
	rm $PACKAGES

	#
	# delete jenkins html logs from reproducible_builder_(fedora|archlinux)* jobs as they are mostly redundant
	# (they only provide the extended value of parsed console output, which we dont need here.)
	#
	# OLDSTUFF is the number of removed files ('rm -v' prints one line per file)
	OLDSTUFF=$(find /var/lib/jenkins/jobs/reproducible_builder_* -maxdepth 3 -mtime +0 -name log_content.html  -exec rm -v {} \; | wc -l)
	# wc always prints a number, so compare it instead of testing for an
	# empty string (which would report "Removed 0 ..." on every run)
	if [ "$OLDSTUFF" -gt 0 ] ; then
		echo
		echo "Removed $OLDSTUFF jenkins html logs."
		echo
	fi

fi

# find+terminate processes which should not be there
echo "$(date -u) - Looking for processes which should not be there."
HAYSTACK=$(mktemp --tmpdir=$TEMPDIR maintenance-XXXXXXXXXXX)
RESULT=$(mktemp --tmpdir=$TEMPDIR maintenance-XXXXXXXXXXX)
TOKILL=$(mktemp --tmpdir=$TEMPDIR maintenance-XXXXXXXXXXX)
PBUIDS="1234 1111 2222"
# snapshot of all processes, the pid is the first column
ps axo pid,user,size,pcpu,cmd > $HAYSTACK
for i in $PBUIDS ; do
	# processes of that uid which have pid 1 as parent
	for PROCESS in $(pgrep -u $i -P 1 || true) ; do
		# faked-sysv comes and goes...
		#
		# compare the whole first field: ps right-aligns the pid column, so
		# anchoring a regex at the start of the line would miss padded pids
		# and would also match longer pids starting with the same digits
		awk -v pid="$PROCESS" '$1 == pid' $HAYSTACK | grep -v faked-sysv >> $RESULT 2> /dev/null || true
	done
done
if [ -s $RESULT ] ; then
	for PROCESS in $(awk '{ print $1 }' $RESULT) ; do
		# elapsed time in seconds, 0 if the process is gone by now
		AGE=$(ps -p $PROCESS -o etimes= || echo 0)
		# a single build may take day, so... (first build: 18h, 2nd: 24h)
		if [ $AGE -gt $(( 24*60*60 )) ] ; then
			echo "$PROCESS" >> $TOKILL
		fi
	done
	if [ -s $TOKILL ] ; then
		DIRTY=true
		PSCALL=""
		echo
		echo "Info: processes found which should not be there, killing them now:"
		# build a comma separated list of pids for 'ps -p'
		for PROCESS in $(cat $TOKILL) ; do
			PSCALL=${PSCALL:+"$PSCALL,"}"$PROCESS"
		done
		ps -F -p $PSCALL
		echo
		for PROCESS in $(cat $TOKILL) ; do
			sudo kill -9 $PROCESS 2>&1
			echo "'kill -9 $PROCESS' done."
		done
		echo
	fi
fi
rm $HAYSTACK $RESULT $TOKILL
# There are naughty processes spawning childs and leaving them to their grandparents
PSCALL=""
for i in $PBUIDS ; do
	for p in $(pgrep -u $i) ; do
		# elapsed time in seconds, 0 if the process is gone by now
		AGE=$(ps -p $p -o etimes= || echo 0)
		# let's be generous and consider 26 hours here...
		if [ $AGE -gt $(( 26*60*60 )) ] ; then
			sudo kill -9 $p 2>&1 || (echo "Could not kill:" ; ps -F -p "$p")
			sleep 2
			# check it's gone
			AGE=$(ps -p $p -o etimes= || echo 0)
			if [ $AGE -gt $(( 14*60*60 )) ] ; then
				# still alive: remember the pid for the warning below
				PSCALL=${PSCALL:+"$PSCALL,"}"$p"
			fi
		fi
	done
done
if [ ! -z "$PSCALL" ] ; then
	echo -e "Warning: processes found which should not be there and which could not be killed. Please fix up manually:"
	ps -F -p "$PSCALL"
	echo
fi

# find builds which should not be there
# (not on i386 as we start builds differently here… work in progress)
if [ "$ARCH" != "i386" ] ; then
	# pgrep exits non-zero if nothing matched, which is the good case here
	RESULTS=$(pgrep -f reproducible_build.sh --parent 1 || true)
	if [ ! -z "$RESULTS" ] ; then
		DIRTY=true
		echo "Warning: found reproducible_build.sh processes which have pid 1 as parent (and not sshd), thus something went wrong… please investigate."
		echo -e "$RESULTS"
	fi
fi

# remove artifacts older than a day
echo "$(date -u) - Checking for artifacts older than a day."
# first only list the candidates, so that we know whether there is something to do
ARTIFACTS=$(find $DEBIAN_BASE/artifacts/r00t-me/* -maxdepth 1 -type d -mtime +1 -exec ls -lad {} \; 2>/dev/null|| true)
if [ ! -z "$ARTIFACTS" ] ; then
	echo
	echo "Removed old artifacts:"
	# same find expression as above, this time removing (verbosely)
	find $DEBIAN_BASE/artifacts/r00t-me/* -maxdepth 1 -type d -mtime +1 -exec rm -rv --one-file-system {} \; || true
	echo
fi

# find + chmod files with bad permissions
echo "$(date -u) - Checking for files with bad permissions."
# automatically fix rbuild files with wrong permissions...
# (we know it happens (very rarely) but... shrugs.)
[ ! -d $DEBIAN_BASE/rbuild ] || find $DEBIAN_BASE/rbuild ! -perm 644 -type f -exec chmod -v 644 {} \; 2>/dev/null|| true
# everywhere else bad permissions are only reported, together with the
# command to fix them
BADPERMS=$(find $DEBIAN_BASE/{buildinfo,dbd,artifacts,stretch,buster,unstable,experimental,rb-pkg} ! -perm 644 -type f 2>/dev/null|| true)
if [ ! -z "$BADPERMS" ] ; then
    DIRTY=true
    echo
    echo "Warning: Found files with bad permissions (!=644):"
    echo "Please fix permission manually"
    echo "$BADPERMS" | xargs echo chmod -v 644
    echo
fi

# daily mails
# (only on the main node and only during the first hour (UTC) of the day)
if [ "$HOSTNAME" = "$MAINNODE" ] && [ $(date -u +%H) -eq 0 ]  ; then
	# once a day, send mail about builder problems
	files_to_mail=(
		/var/log/jenkins/reproducible-builder-errors.log
		/var/log/jenkins/reproducible-stale-builds.log
		/var/log/jenkins/reproducible-archlinux-stale-builds.log
		/var/log/jenkins/reproducible-race-conditions.log
		/var/log/jenkins/reproducible-diskspace-issues.log
		/var/log/jenkins/reproducible-remote-error.log
		/var/log/jenkins/reproducible-scheduler.log
		/var/log/jenkins/reproducible-env-changes.log
		/var/log/jenkins/reproducible-submit2buildinfo.debian.net.log
		/var/log/postgresql/postgresql-9.6-main.log
	)
	for PROBLEM in "${files_to_mail[@]}" ; do
		# only non-empty logfiles are of interest
		if [ -s $PROBLEM ] ; then
			TMPFILE=$(mktemp --tmpdir=$TEMPDIR maintenance-XXXXXXXXXXXX)
			if [ "$(dirname $PROBLEM)" = "/var/log/jenkins" ] ; then
				if [ "$(basename $PROBLEM)" = "reproducible-diskspace-issues.log" ]; then
					echo "diskspace issues should always be investigated." > $TMPFILE
				fi
				# put a short summary (entries per architecture) on top of the mail
				if grep -q https $PROBLEM ; then
					echo "$(grep -c https $PROBLEM) entries found:"
					if [ "$(basename $PROBLEM)" != "reproducible-remote-error.log" ] && [ "$(basename $PROBLEM)" != "reproducible-race-conditions.log" ] ; then
						OTHERPROJECTS=""
					else
						OTHERPROJECTS="archlinux fedora"
					fi
					echo "$(grep -c https $PROBLEM || echo 0) entries found:" >> $TMPFILE
					for a in $ARCHS $OTHERPROJECTS; do
						echo "- $(grep https $PROBLEM|grep -c ${a}_) from $a." >> $TMPFILE
					done
				elif grep -q 'stale builds found' $PROBLEM ; then
					echo "$(grep -c 'stale builds found' $PROBLEM || echo 0) entries found:" >> $TMPFILE
					for a in $ARCHS ; do
						echo "- $(grep -c ${a}_ $PROBLEM) from $a." >> $TMPFILE
					done
				fi
				echo >> $TMPFILE
				# maybe we should use logrotate for our jenkins logs too…
				cat $PROBLEM >> $TMPFILE
				rm $PROBLEM
			else
				# regular logfile, logrotate is used (and the file ain't owned by jenkins)
				# only care for yesterday's entries:
				( grep $(date -u -d "1 day ago" '+%Y-%m-%d') $PROBLEM || echo "no problems yesterday…" ) > $TMPFILE
			fi
			# send mail if we found issues
			if [ -s $TMPFILE ] && ! grep -q "no problems yesterday…" $TMPFILE ; then
				if [ "$(basename $PROBLEM)" = "reproducible-submit2buildinfo.debian.net.log" ]; then
					CC="-c lamby@debian.org"
				fi
				# ${CC:-} is deliberately unquoted: it holds an option plus its value
				cat $TMPFILE | mail -s "$(basename $PROBLEM) found" ${CC:-} qa-jenkins-scm@lists.alioth.debian.org
				CC=""
			fi
			rm -f $TMPFILE
		fi
	done
	# once a day, send notifications to package maintainers
	cd /srv/reproducible-results/notification-emails
	for NOTE in $(find . -type f) ; do
			TMPFILE=$(mktemp --tmpdir=$TEMPDIR maintenance-XXXXXXXXXXXX)
			# the name of the file is the name of the package
			PKG=$(basename $NOTE)
			mv $NOTE $TMPFILE
			cat $TMPFILE | mail -s "$PKG: status change on tests.reproducible-builds.org/debian" \
				-a "From: Reproducible builds folks <reproducible-builds@lists.alioth.debian.org>" \
				-a "X-Reproducible-Builds-Pkg: $PKG" \
				 $PKG@packages.debian.org
			rm -f $TMPFILE
	done
fi

# DIRTY holds "true" or "false" and is run as a command here
if ! $DIRTY ; then
	echo "$(date -u ) - Everything seems to be fine."
	echo
fi

echo "$(date -u) - the end."