reproducible_maintenance.sh 31.9 KB
Newer Older
1
#!/bin/bash
2
# vim: set noexpandtab:
3

4
# Copyright 2014-2022 Holger Levsen <holger@layer-acht.org>
5
#         © 2015-2021 Mattia Rizzolo <mattia@debian.org>
Holger Levsen's avatar
Holger Levsen committed
6
# released under the GPLv2
7

8
DEBUG=false
9
10
11
. /srv/jenkins/bin/common-functions.sh
common_init "$@"

12
# common code for tests.reproducible-builds.org
13
14
. /srv/jenkins/bin/reproducible_common.sh

Holger Levsen's avatar
Holger Levsen committed
15
16
17
18
#
# we fail hard
#
set -e
19
20
21

# query reproducible database, print output
#
# Arguments: "$@" is handed to 'psql -c' unchanged.
# Outputs:   the psql output on stdout. Trailing newlines are removed by the
#            command substitution and none is appended.
# The result is printed through a fixed '%s' format: it is data, not a format
# string, so '%' and '\' in query results must be emitted literally.
query_to_print() {
	printf '%s' "$(psql -c "$@")"
}

Holger Levsen's avatar
Holger Levsen committed
25
26
27
28
# define some variables before we start
DIRTY=false
REP_RESULTS=/srv/reproducible-results

29
30
31
32
33
34
35
36
37
38
39
#
# find too large files in /var/log
#
echo "$(date -u) - Looking for too large files in /var/log/"
TOOBIG=$(find /var/log -size +8G -exec ls -lah {} \; 2>/dev/null || true)
if [ ! -z "$TOOBIG" ] ; then
	echo
	echo "$(date -u) - Warning: too large files found in /var/log:"
	echo "$TOOBIG"
	echo
	DIRTY=true
40
	if [ -n "$(find /var/log -size +32G 2> >(grep -v 'Permission denied'))" ] ; then
41
42
43
44
45
		echo "$(date -u) - Error, more than 32gb is just wrong..."
		exit 1
	fi
fi

46
#
47
# delete old temp directories in $REP_RESULTS/rbuild-debian
48
#
49
echo "$(date -u) - Deleting temp directories in $REP_RESULTS/rbuild-debian, older than 3 days."
50
OLDSTUFF=$(find $REP_RESULTS/rbuild-debian -maxdepth 1 -type d -mtime +2 -name "tmp.*" -exec ls -lad {} \; 2>/dev/null|| true)
51
52
53
if [ ! -z "$OLDSTUFF" ] ; then
	echo
	echo "Old temp directories found in $REP_RESULTS/rbuild-debian"
54
	find $REP_RESULTS/rbuild-debian -maxdepth 1 -type d -mtime +2 -name "tmp.*" -exec rm -rv --one-file-system {} \; || true
55
56
57
58
59
	echo "These old directories have been deleted."
	echo
	DIRTY=true
fi

60
#
61
# delete old temp directories in /tmp (probably only useful on osuosl171+172 & 167[for janitor])
62
63
#
echo "$(date -u) - Deleting temporary directories in /tmp, older than 3 days."
64
OLDSTUFF=$(find /tmp -maxdepth 1 -type d -mtime +2 -regextype egrep -regex '/tmp/(tmp.*|Test.*|usession-release.*|.*test.*|janitor.{8})' -exec ls -lad {} \; || true)
65
66
67
if [ ! -z "$OLDSTUFF" ] ; then
	echo
	echo "Old temp directories found in /tmp"
68
	find /tmp -maxdepth 1 -type d -mtime +2 -regextype egrep -regex '/tmp/(tmp.*|Test.*|usession-release.*|.*test.*|janitor.{8})' -exec sudo rm -rv --one-file-system {} \; || true
69
70
71
72
73
	echo "These old directories have been deleted."
	echo
	DIRTY=true
fi

74
75
76
77
78
79
80
81
82
83
#
# delete old pbuilder build directories
#
if [ -d /srv/workspace/pbuilder/ ] ; then
	echo "$(date -u) - Deleting pbuilder build directories, older than 3 days."
	OLDSTUFF=$(find /srv/workspace/pbuilder/ -maxdepth 2 -regex '.*/[0-9]+' -type d -mtime +2 -exec ls -lad {} \; || true)
	if [ ! -z "$OLDSTUFF" ] ; then
		echo
		echo "Old pbuilder build directories found in /srv/workspace/pbuilder/"
		echo -n "$OLDSTUFF"
84
		( find /srv/workspace/pbuilder/ -maxdepth 2 -regex '.*/[0-9]+' -type d -mtime +2 -exec sudo rm -rf --one-file-system {} \; ) || true
85
86
87
88
89
		echo
		DIRTY=true
	fi
fi

90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
#
# delete old temp directories $REP_RESULTS/(archlinuxrb-build|rbuild-openwrt-results)-????????
#
# (the log line names the directories this section really handles)
echo "$(date -u) - Deleting archlinuxrb-build and rbuild-openwrt-results temp directories in $REP_RESULTS/, older than 3 days."
OLDSTUFF=$(find "$REP_RESULTS"/ -maxdepth 1 -type d -mtime +2 -regextype awk -regex "$REP_RESULTS/(archlinuxrb-build|rbuild-openwrt-results)-........" -exec ls -lad {} \; 2>/dev/null|| true)
if [ ! -z "$OLDSTUFF" ] ; then
	echo
	echo "Old archlinuxrb-build and rbuild-openwrt-results temp directories found in $REP_RESULTS/"
	# rm -v lists what gets removed; failures are deliberately ignored
	find "$REP_RESULTS"/ -maxdepth 1 -type d -mtime +2 -regextype awk -regex "$REP_RESULTS/(archlinuxrb-build|rbuild-openwrt-results)-........" -exec rm -rv --one-file-system {} \; || true
	echo "These old directories have been deleted."
	echo
	DIRTY=true
fi


105
106
107
108
109
#
# delete old chroot-installation directories (not related to reproducible builds)
#
if [ -d /srv/workspace/chroots/ ] ; then
	echo "$(date -u) - Deleting chroots build directories, older than 7 days."
110
	OLDSTUFF=$(find /srv/workspace/chroots/ -maxdepth 2 -name 'chroot-installation*' -type d -mtime +6 -exec ls -lad {} \; || true)
111
112
113
114
	if [ ! -z "$OLDSTUFF" ] ; then
		echo
		echo "Old chroot-installation directories found in /srv/workspace/chroots/"
		echo -n "$OLDSTUFF"
115
		( find /srv/workspace/chroots/ -maxdepth 2 -name 'chroot-installation*' -type d -mtime +6 -exec sudo rm -rf --one-file-system {} \; ) || true
116
117
118
119
120
		echo
		DIRTY=true
	fi
fi

121
122
123
#
# delete old temp directories for live-build
#
124
125
echo "$(date -u) - Deleting temp result directories for live-build, older than 1 day."
OLDSTUFF=$(find $REP_RESULTS -maxdepth 1 -type d -mtime +0 -name "live-build-*" -exec ls -lad {} \; 2>/dev/null|| true)
126
127
if [ ! -z "$OLDSTUFF" ] ; then
	echo
128
	echo "Old temp directories for live-build found in $REP_RESULTS"
129
	echo -n "$OLDSTUFF"
130
	find $REP_RESULTS -maxdepth 1 -type d -mtime +0 -name "live-build-*" -exec rm -rf --one-file-system {} \; || true
131
	echo "These old temp directories for live-build have been deleted."
132
133
134
	echo
	DIRTY=true
fi
135
136
echo "$(date -u) - Deleting temp workspace directories for live-build, older than 1 day."
OLDSTUFF=$(find /srv/workspace/live-build -maxdepth 1 -type d -mtime +0 -name "*.*" -exec ls -lad {} \; 2>/dev/null|| true)
137
138
if [ ! -z "$OLDSTUFF" ] ; then
	echo
139
	echo "Old temp workspace directories for live-build found in /srv/workspace/live-build"
140
141
	echo -n "$OLDSTUFF"
	find /srv/workspace/live-build -maxdepth 1 -type d -mtime +0 -name "*.*" -exec sudo rm -rf --one-file-system {} \; || true
142
	echo "These old temp workspace directories have been deleted."
143
144
145
146
	echo
	DIRTY=true
fi

147
148
149
150
151
152
#
# check for working proxy
#
echo "$(date -u) - testing whether the proxy works..."
# curl has to be the condition of the 'if': this script runs with 'set -e',
# so a bare failing curl would end the script before the error message
# below could be printed.
if ! curl http://www.debian.org > /dev/null ; then
	echo "Error: curl http://www.debian.org failed, probably the proxy is down for $HOSTNAME"
	exit 1
fi

157
158
159
160
161
162
if [ "$HOSTNAME" = "$MAINNODE" ] ; then
	#
	# find nodes with problems and temporarily turn them offline
	#
	# For every node health check / maintenance job in ~/jobs the newest build
	# is compared with the last successful one. Nodes whose jobs look broken
	# are appended to $JENKINS_OFFLINE_LIST and reported by mail.
	#
	echo "$(date -u) - Looking for unhealthy nodes."
	cd ~/jobs
	# NOTE(review): the other mktemp calls in this file use $TEMPDIR, this one
	# uses $TMPDIR - confirm that this is intended.
	DUMMY_FILE=$(mktemp --tmpdir=$TMPDIR maintenance-XXXXXXX)
	SICK=""
	for i in reproducible_node_health_check_* reproducible_maintenance_* ; do
		# job names are <prefix>_<arch>_<alias>; FORCE_DATE and MAXDIFF are
		# the per job type thresholds used further down
		case $i in
			reproducible_node_health_check_amd64_jenkins|reproducible_maintenance_amd64_jenkins)
				echo "Skipping $i..."
				continue
				;;
			reproducible_node_health_check_*)
				NODE_ALIAS=$(echo "$i" | cut -d '_' -f6)
				NODE_ARCH=$(echo "$i" | cut -d '_' -f5)
				FORCE_DATE=$(date -u -d "3 hour ago" '+%Y-%m-%d %H:%M')
				MAXDIFF=12
				;;
			reproducible_maintenance_*)
				NODE_ALIAS=$(echo "$i" | cut -d '_' -f4)
				NODE_ARCH=$(echo "$i" | cut -d '_' -f3)
				FORCE_DATE=$(date -u -d "8 hour ago" '+%Y-%m-%d %H:%M')
				MAXDIFF=3
				;;
		esac
		# reference file carrying the FORCE_DATE timestamp, for the -ot test below
		touch -d "$FORCE_DATE" $DUMMY_FILE
		# reset NODE so that a job which matches none of the patterns below
		# cannot silently reuse the node name of the previous iteration
		NODE=""
		case $NODE_ARCH in
			amd64)
				case "$NODE_ALIAS" in
					ionos*)		NODE="$NODE_ALIAS-amd64.debian.net" ;;
					osuosl*)	NODE="osuosl${NODE_ALIAS#osuosl}-amd64.debian.net" ;;
				esac ;;
			i386)	NODE="$NODE_ALIAS-i386.debian.net" ;;
			arm64)	NODE="codethink${NODE_ALIAS#codethink}-arm64.debian.net" ;;
			armhf)	NODE="${NODE_ALIAS}-armhf-rb.debian.net" ;;
		esac
		if [ -z "$NODE" ] ; then
			echo "Warning: cannot derive a node name from job $i, skipping it."
			continue
		fi
		case "$NODE" in
			ionos9-amd64.debian.net|ionos10-amd64.debian.net)
				# ionos9 and ionos10 are not used for r-b and sometimes are too busy
				# to run healthcheck / maintenance jobs
				echo "Skipping ${NODE}..."
				continue
				;;
		esac
		cd $i/builds
		# newest entry in the builds directory vs. the last successful build
		LAST=$(ls -rt1 | tail -1)
		GOOD=$(awk '/^lastSuccessfulBuild/ {print $2}' permalinks)
		if [ "$LAST" = "$GOOD" ] ; then
			DIFF=0
		else
			# 'let' fails if either value is not a number; that is mapped to -1
			let DIFF=$LAST-$GOOD || DIFF=-1
		fi
		if [ $DIFF -eq -1 ] ; then
			echo "Warning: Problems analysing $i build logs, ignoring $NODE."
		# either the diff is greater than $MAXDIFF (=the last $MAXDIFF job runs failed)
		# or the newest build is older than $FORCE_DATE (=a job is still running/hanging)
		elif [ $DIFF -gt $MAXDIFF ] || [ $LAST -ot $DUMMY_FILE ] ; then
			echo -n "$i job has issues since more than an hour"
			if grep -q "$NODE" "$JENKINS_OFFLINE_LIST" >/dev/null 2>&1 ; then
				echo " and $NODE already marked as offline, good."
			else
				echo "$NODE" >> "$JENKINS_OFFLINE_LIST"
				echo " so $NODE has (temporarily) been marked as offline now."
				SICK="$SICK $NODE"
			fi
		else
			echo "$NODE is doing fine, good."
		fi
		cd ../..
	done
	if [ -n "$SICK" ] ; then
		# shorten the host names and drop the leading blank before building the mail text
		SICK=$(echo "$SICK" | sed 's#.debian.net##g' | sed 's#-rb##g' | sed 's# ##' )
		if echo "$SICK" | grep -q ' ' 2>/dev/null ; then
			SICK=$(echo "$SICK" | sed 's# # and #g')
			MESSAGE="$SICK have health problems and have temporarily been marked as offline."
		else
			MESSAGE="$SICK has health problems and has temporarily been marked as offline."
		fi
		MESSAGE="$MESSAGE To make this permanent, edit jenkins-home/offline_nodes in git."
		RECIPIENTS="mattia@debian.org holger@debian.org"
		# armhf problems are sent to one more recipient
		if echo -e "$MESSAGE" | grep -q armhf 2>/dev/null ; then
			RECIPIENTS="$RECIPIENTS vagrant@reproducible-builds.org"
		fi
		for TO in $RECIPIENTS ; do
			echo -e "$MESSAGE" | mail -s "jenkins nodes temporarily marked offline" $TO
		done
	fi
	rm -f $DUMMY_FILE
fi

250
echo "$(date -u) - updating the chdists, schroots and pbuilder now..."
Holger Levsen's avatar
Holger Levsen committed
251
# use host architecture (only)
252
ARCH=$(dpkg --print-architecture)
Holger Levsen's avatar
Holger Levsen committed
253
254
255
256
# use host apt proxy configuration for pbuilder
if [ ! -z "$http_proxy" ] ; then
	pbuilder_http_proxy="--http-proxy $http_proxy"
fi
257
for s in $SUITES ; do
258
	for i in osuosl ionos3 ionos7 ionos9 ionos10 ; do
259
260
261
262
263
		if [ "${HOSTNAME:0:${#i}}" = "$i" ]; then
			# this node is not used to do Debian rebuilds, skip it all
			continue 2
		fi
	done
264
265
266
267
268
269
270
271
272
273
274
	#
	# chdist update
	#
	distname="$s-$ARCH"
	echo "$(date -u) - updating the $s/$ARCH chdist now."
	if [ ! -d "$CHPATH/$distname" ]; then
		echo "$(date -u) - chdist not existing, creating one now..."
		if ! chdist --data-dir="$CHPATH" --arch="$ARCH" create "$distname" "$MIRROR" "$s" main ; then
			echo "Error: failed to create the $s/$ARCH chdist."
			exit 1
		fi
275
		. /srv/jenkins/bin/jenkins_node_definitions.sh
Holger Levsen's avatar
Holger Levsen committed
276
		get_node_information "$HOSTNAME"
277
278
279
280
		if "$NODE_RUN_IN_THE_FUTURE" ; then
			echo "This node is reported to run in the future, configuring APT to ignore the Release file expiration..."
			echo 'Acquire::Check-Valid-Until "false";' > "$CHPATH/$distname/etc/apt/apt.conf.d/398future"
		fi
281
	fi
282
	if ! chdist --data-dir="$CHPATH" apt-get "$distname" -q update ; then
283
284
285
		echo "Warning: failed to update the $s/$ARCH chdist."
		DIRTY=true
	fi
286
287
288
	#
	# pbuilder update
	#
289
	# pbuilder aint used on jenkins anymore
290
	if [ "$HOSTNAME" = "$MAINNODE" ] ; then
291
292
293
294
		continue
	else
		echo "$(date -u) - updating pbuilder for $s/$ARCH now."
	fi
295
	# Try up to four times to update the pbuilder base tarball of suite $s.
	# The exit status is collected with '|| RESULT=$?': with 'set -e' active
	# a failing pbuilder as plain last command of a '||' list would end the
	# whole script, and the retry below would never run.
	for i in 1 2 3 4 ; do
		RESULT=0
		if [ -f /var/cache/pbuilder/$s-reproducible-base.tgz ] ; then
			# $pbuilder_http_proxy is unquoted on purpose: it is either empty or two words
			sudo pbuilder --update $pbuilder_http_proxy --basetgz /var/cache/pbuilder/$s-reproducible-base.tgz || RESULT=$?
		fi
		if [ $RESULT -eq 0 ] ; then
			break
		fi
		# sleep 60-120 secs
		echo "Sleeping some time... (to workaround network problems like 'Hash Sum mismatch'...)"
		/bin/sleep $(echo "scale=1 ; ($(shuf -i 1-600 -n 1)/10)+60" | bc )
		echo "$(date -u) - Retrying to update pbuilder for $s/$ARCH."
	done
	# any non-zero exit status of the last attempt counts as failure
	if [ $RESULT -ne 0 ] ; then
		echo "Warning: failed to update pbuilder for $s/$ARCH."
		DIRTY=true
	fi
done
set -e

314
315
316
# for alpine
# (errors are handled by hand in this section, hence set +e / set -e around it)
set +e
case $HOSTNAME in
	osuosl184*|osuosl170*|jenkins)
		echo "$(date -u) - updating alpine schroot now."
		# remember a failure of either step, not only of the last one
		RESULT=0
		schroot --directory /tmp -c source:jenkins-reproducible-alpine -u root -- apk update || RESULT=$?
		schroot --directory /tmp -c source:jenkins-reproducible-alpine -u root -- apk upgrade || RESULT=$?
		if [ $RESULT -ne 0 ] ; then
			echo "Warning: failed to update alpine schroot."
			DIRTY=true
		else
			echo "$(date -u) - updating alpine schroot done."
		fi
		;;
	*)	;;
esac
set -e

333
# for Arch Linux
# (errors are handled by hand in this section, hence set +e / set -e around it)
set +e
case $HOSTNAME in
	osuosl184*|osuosl170*|jenkins)
		echo "$(date -u) - updating Arch Linux schroot now."
		RESULT=0
		schroot --directory /tmp -c source:jenkins-reproducible-archlinux -u root -- pacman -Syu --noconfirm || RESULT=$?
		# every non-zero exit status is a failed update, not just 1
		if [ $RESULT -ne 0 ] ; then
			echo "Let's see if /var/lib/pacman/db.lck exists in the schroot..."
			# ls prints the path only if the lock file is there
			if [ "$(schroot --directory /tmp -c source:jenkins-reproducible-archlinux -u root -- ls /var/lib/pacman/db.lck)" = "/var/lib/pacman/db.lck" ] ; then
				echo "Warning: failed to update Arch Linux schroot, pacman/db.lck exists."
			else
				echo "Warning: failed to update Arch Linux schroot."
			fi
			DIRTY=true
		else
			echo "$(date -u) - updating Arch Linux schroot done."
		fi
		;;
	*)	;;
esac
set -e
355

356
# delete build services logfiles
dir=/var/lib/jenkins/userContent/reproducible/debian/build_service/
if [ "$HOSTNAME" = "$MAINNODE" ] ; then
	if [ -d $dir ] ; then
		echo "$(date -u) - Deleting logfiles from build services directories, older than a day."
		# build directories are the all-digit directories up to two levels below $dir
		OLDSTUFF=$(find $dir -maxdepth 2 -regex '.*/[0-9]+' -type d -mtime +0 -exec ls -lad {} \; || true)
		if [ ! -z "$OLDSTUFF" ] ; then
			echo
			echo "Old logfiles cleaned in $dir"
			echo -n "$OLDSTUFF" | sed "s#$dir#./#g"
			# we make sure to actually only delete console.log.gz older than a day
			# other stuff we only delete after two days (in case a build is running more than 24h...)
			#
			# the console.log.gz files live one level below the all-digit
			# directories, so they are matched as regular files by their full
			# path; a test for a directory that is named both [0-9]+ and
			# console.log.gz can never match anything.
			find $dir -maxdepth 3 -type f -regex '.*/[0-9]+/console\.log\.gz' -mtime +0 -exec rm -f {} \; || true
			find $dir -maxdepth 2 -regex '.*/[0-9]+' -type d -mtime +1 -exec rm -rf --one-file-system {} \; || true
			echo
		fi
	fi
fi

375
# remove too old schroot sessions
376
echo "$(date -u) - Removing schroot sessions older than 3 days."
377
dir=/var/lib/schroot/unpack/
378
OLDSTUFF=$(find "$dir" -mindepth 1 -maxdepth 1 -type d -mtime +2 -exec ls -lad {} \;)
379
380
if [ ! -z "$OLDSTUFF" ]; then
	echo
381
	echo "schroot sessions older than 3 days found, which will be deleted:"
382
383
384
	echo "$OLDSTUFF"
	echo
	for s in $(find "$dir" -mindepth 1 -maxdepth 1 -type d -mtime +2 -print0 | xargs -0 -r basename -a); do
385
386
		echo "$(date -u) - trying to end schroot sessions $s..."
		schroot -c "$s" --end-session || true
387
	done
388
	OLDSTUFF=$(find "$dir" -mindepth 1 -maxdepth 1 -type d -mtime +2 -exec ls -lad {} \;)
389
390
	if [ ! -z "$OLDSTUFF" ]; then
		echo
391
		echo "Warning: Tried, but failed to delete these schroot sessions:"
392
		echo "$OLDSTUFF"
393
		echo "Manual cleanup needed."
394
395
396
397
	fi
	echo
	DIRTY=true
fi
398

399
# remove too old schroot session data from diffoscope
if [ "$HOSTNAME" = "$MAINNODE" ] ; then
	echo "$(date -u) - Removing diffoscope schroot session data older than 2 days."
	# step 1: end sessions whose session file is older than two days
	mapfile -t OLDSTUFF < <(find /var/lib/schroot/session -name "jenkins-reproducible-*-diffoscope-*" -type f -mtime +1 -exec ls -lad {} \;)
	if [ "${#OLDSTUFF[@]}" -ne 0 ]; then
		echo
		echo "Found old schroot (from diffoscope) sessions, which will be terminated now:"
		printf '%s\n' "${OLDSTUFF[@]}"
		echo
		for session_path in "${OLDSTUFF[@]}"; do
			# don't use `basename` because session_path contains the whole output from ls -l, not just the path
			session=${session_path##*/}
			if ! schroot --end-session -c "$session" ; then
				echo "Warning: failed to end schroot session: $session"
				DIRTY=true
			fi
		done
	fi
	# step 2: remove union directories that no session file refers to anymore
	for path in underlay overlay ; do
		# these directories can easily have the mtime of when the base schroot
		# was created, so be very conservative about removing them, else
		# one risks to delete stuff that is currently in use.
		mapfile -t OLDSTUFF < <(find "/var/lib/schroot/union/$path" -maxdepth 1 -name "jenkins-reproducible-*-diffoscope-*" -type d -mtime +31 -exec ls -lad {} \;)
		if [ ${#OLDSTUFF[@]} -ne 0 ]; then
			echo
			echo "Found old schroot (from diffoscope) $path, which will be investigated now:"
			printf '%s\n' "${OLDSTUFF[@]}"
			echo
			pushd /var/lib/schroot/session >/dev/null
			for dir_ls in "${OLDSTUFF[@]}"; do
				# don't use `basename`,etc because dir_ls contains the whole output from ls -l, not just the path
				dir_path="/${dir_ls#*/}"
				# Look through the session files in the current directory.
				# grep needs file operands here, without them it would read
				# stdin and never report a match, so every directory would be
				# treated as unreferenced. The key is built from $path so that
				# underlay directories are compared with the underlay setting.
				mapfile -t matches < <(grep -lF -- "union-$path-directory=$dir_path" ./* 2>/dev/null)
				if [ ${#matches[@]} -eq 0 ]; then
					echo
					echo "This $path is not referenced by any existing schroot, deleting it now."
					if [ -d "$dir_path" ]; then
						# trace the rm call, then switch tracing off again unless DEBUG is set
						set -x
						sudo rm -rf --one-file-system "$dir_path"
						if ! "$DEBUG" ; then set +x ; fi
					fi
				fi
			done
			popd >/dev/null
		fi
		# check what is still there after the cleanup attempt
		mapfile -t OLDSTUFF < <(find "/var/lib/schroot/union/$path" -maxdepth 1 -name "jenkins-reproducible-*-diffoscope-*" -type d -mtime +31 -exec ls -lad {} \;)
		if [ ${#OLDSTUFF[@]} -ne 0 ]; then
			echo
			echo "Warning: Tried, but failed to delete these in /var/lib/schroot/union/$path:"
			printf '%s\n' "${OLDSTUFF[@]}"
			echo "Manual cleanup needed."
			echo
			DIRTY=true
		fi
	done
fi
455

Holger Levsen's avatar
Holger Levsen committed
456
457
458
459
460
461
462
# cleanup old schroots
#
# Arguments:
#   $1 - OBJECTS: description of what is cleaned up, only used in messages
#   $2 - AGE:     handed to find as '-mtime +AGE'
#   $3 - regex:   posix-extended regex, matched by find against the whole path
# Globals written: OLDSTUFF; DIRTY is set to true if matching directories were found
cleanup_old_schroots() {
	local OBJECTS="$1"
	local AGE="$2"
	local regex="$3"
	local MP
	echo "$(date -u) - Removing $OBJECTS older than $AGE days."
	OLDSTUFF=$(find /schroots/ -maxdepth 1 -type d -regextype posix-extended -regex "$regex" -mtime "+$AGE" -exec ls -lad {} \; || true)
	if [ ! -z "$OLDSTUFF" ] ; then
		# try to unmount mounts first
		# (NUL separated, so paths are never split or expanded by the shell)
		while IFS= read -r -d '' MP ; do
			if [ -d "$MP/proc" ] && mountpoint -q "$MP/proc" ; then
				sudo umount -l "$MP/proc" 2>/dev/null || echo "umount -l $MP/proc failed, continuing."
			fi
		done < <(find /schroots/ -maxdepth 1 -type d -regextype posix-extended -regex "$regex" -mtime "+$AGE" -print0)
		echo
		echo "$OBJECTS older than $AGE found in /schroots, which will be deleted:"
		find /schroots/ -maxdepth 1 -type d -regextype posix-extended -regex "$regex" -mtime "+$AGE" -exec sudo rm -rf --one-file-system {} \; || true
		echo "$OLDSTUFF"
		# whatever still matches now could not be removed
		OLDSTUFF=$(find /schroots/ -maxdepth 1 -type d -regextype posix-extended -regex "$regex" -mtime "+$AGE" -exec ls -lad {} \; || true)
		if [ ! -z "$OLDSTUFF" ] ; then
			echo
			echo "Warning: Tried, but failed to delete these $OBJECTS:"
			echo "$OLDSTUFF"
			echo "Manual cleanup needed."
		fi
		echo
		DIRTY=true
	fi
}
cleanup_old_schroots "schroots" 31 "/schroots/(reproducible-.+-[0-9]{1,5}|schroot-install-.+)"
486
487
# FIXME: find a way to not forget this for the bookworm release, when trixie will become the development version
cleanup_old_schroots "unstable and bookworm schroots" 5 "/schroots/reproducible-(unstable|bookworm)-diffoscope-.+-[0-9]{1,5}"
488

489
490
# find very old schroots
echo "$(date -u) - Detecting schroots older than 1 month"
# the reproducible-archlinux schroot is ignored because it's meant to be long living
#
# '|| true' only neutralises the exit status of grep (non-zero when nothing
# is left after filtering). Piping into 'true' instead would throw away the
# whole listing and the warning below could never be shown.
OLDSTUFF=$(find /schroots/ -mindepth 1 -maxdepth 1 -mtime +30 -exec ls -lad {} \; | grep -v reproducible-archlinux || true)
if [ ! -z "$OLDSTUFF" ]; then
	echo
	echo "Warning: schroots older than 1 month found in /schroots:"
	echo "$OLDSTUFF"
	echo
	echo "Manual cleanup needed."
	echo
	DIRTY=true
fi

503
504
505
# remove too old sbuild directories
dir=/var/lib/sbuild/build
if [ -d $dir ] ; then
	echo "$(date -u) - Removing sbuild directories older than 3 days."
	# the same '-mtime +2' test is used for listing, deleting and re-checking,
	# so that the announced list, the messages and the deletion agree
	OLDSTUFF=$(find "$dir" -mindepth 1 -maxdepth 1 -type d -mtime +2 -exec ls -lad {} \;)
	if [ ! -z "$OLDSTUFF" ]; then
		echo
		echo "sbuild directories older than 3 days found, which will be deleted:"
		echo "$OLDSTUFF"
		echo
		find "$dir" -mindepth 1 -maxdepth 1 -type d -mtime +2 -exec sudo rm -rf --one-file-system {} \;
		# whatever still matches now could not be removed
		OLDSTUFF=$(find "$dir" -mindepth 1 -maxdepth 1 -type d -mtime +2 -exec ls -lad {} \;)
		if [ ! -z "$OLDSTUFF" ]; then
			echo
			echo "Warning: Tried, but failed to delete these sbuild directories:"
			echo "$OLDSTUFF"
			echo
			echo "Manual cleanup needed."
		fi
		echo
		DIRTY=true
	fi
fi

527
if [ "$HOSTNAME" = "$MAINNODE" ] ; then
528
529
530
531
532
533
	#
	# find failed builds due to network problems and reschedule them
	#
	# only grep through the last 5h (300 minutes) of builds...
	# (ignore "*None.rbuild.log" because these are build which were just started)
	# this job runs every 4h
534
	echo "$(date -u) - Rescheduling failed builds due to network issues."
535
	FAILED_BUILDS=$(find $DEBIAN_BASE/rbuild -type f ! -name "*None.rbuild.log" ! -mmin +300 -exec zgrep -l -E 'E: Failed to fetch.*(Unable to connect to|Connection failed|Size mismatch|Cannot initiate the connection to|Bad Gateway|Service Unavailable)' {} \; 2>/dev/null || true)
536
537
538
539
540
541
	if [ ! -z "$FAILED_BUILDS" ] ; then
		echo
		echo "The following builds have failed due to network problems and will be rescheduled now:"
		echo "$FAILED_BUILDS"
		echo
		echo "Rescheduling packages: "
542
543
		REQUESTER="jenkins maintenance job"
		REASON="maintenance reschedule: reschedule builds which failed due to network errors"
544
545
546
		# Suite, architecture and package name are taken from fixed positions
		# of the log file paths (fields 9, 10 and 11 when split at "/").
		# NOTE(review): SUITE and ARCH are presumably read as globals by
		# schedule_packages, which is why the loop variables carry these names
		# - confirm before renaming. The loop overwrites the host ARCH set earlier.
		for SUITE in $(echo $FAILED_BUILDS | sed "s# #\n#g" | cut -d "/" -f9 | sort -u) ; do
			for ARCH in $(echo $FAILED_BUILDS | sed "s# #\n#g" | cut -d "/" -f10 | sort -u) ; do
				CANDIDATES=$(echo $FAILED_BUILDS | sed "s# #\n#g" | grep "/$SUITE/$ARCH/" | cut -d "/" -f11 | cut -d "_" -f1)
				# double check those builds actually failed
				TO_SCHEDULE=""
				for pkg in $CANDIDATES ; do
					QUERY="SELECT s.name FROM sources AS s JOIN results AS r ON r.package_id=s.id
						   WHERE s.suite='$SUITE' AND s.architecture='$ARCH' AND (r.status='FTBFS' OR r.status='depwait') AND s.name='$pkg'"
					TO_SCHEDULE=${TO_SCHEDULE:+"$TO_SCHEDULE "}$(query_db "$QUERY")
				done
				# not every suite/architecture combination has packages to
				# reschedule, do not call the scheduler with an empty list
				if [ ! -z "$TO_SCHEDULE" ] ; then
					schedule_packages $TO_SCHEDULE
				fi
			done
		done
557
558
559
		DIRTY=true
	fi

560
561
562
563
564
565
566
	#
	# find failed builds due to diffoscope schroot problems and reschedule them
	#
	# only grep through the last 5h (300 minutes) of builds...
	# (ignore "*None.rbuild.log" because these are build which were just started)
	# this job runs every 4h
	echo "$(date -u) - Rescheduling failed builds due to diffoscope schroot issues."
567
	FAILED_BUILDS=$(find $DEBIAN_BASE/rbuild -type f ! -name "*None.rbuild.log" ! -mmin +300 -exec zgrep -l -F 'E: 10mount: error: Directory' {} \; 2>/dev/null|| true)
568
569
570
571
572
573
574
575
	if [ ! -z "$FAILED_BUILDS" ] ; then
		echo
		echo "Warning: The following builds have failed due to diffoscope schroot problems and will be rescheduled now:"
		echo "$FAILED_BUILDS"
		echo
		echo "Rescheduling packages: "
		REQUESTER="jenkins maintenance job"
		REASON="maintenance reschedule: reschedule builds which failed due to diffoscope schroot errors"
576
577
578
		for SUITE in $(echo $FAILED_BUILDS | sed "s# #\n#g" | cut -d "/" -f9 | sort -u) ; do
			for ARCH in $(echo $FAILED_BUILDS | sed "s# #\n#g" | cut -d "/" -f10 | sort -u) ; do
				CANDIDATES=$(echo $FAILED_BUILDS | sed "s# #\n#g" | grep "/$SUITE/$ARCH/" | cut -d "/" -f11 | cut -d "_" -f1 | xargs)
579
				if [ ! -z "$CANDIDATES" ]; then
580
					schedule_packages $CANDIDATES
581
				fi
582
583
584
585
586
			done
		done
		DIRTY=true
	fi

587
588
589
	#
	# find packages which build didnt end correctly
	#
Holger Levsen's avatar
Holger Levsen committed
590
	echo "$(date -u) - Rescheduling builds which didn't end correctly."
591
	DATE=$(date '+%Y-%m-%d %H:%M' -d "-2 days")
592
593
594
	QUERY="
		SELECT s.id, s.name, p.date_scheduled, p.date_build_started
			FROM schedule AS p JOIN sources AS s ON p.package_id=s.id
595
			WHERE p.date_build_started IS NOT NULL
596
			AND p.date_build_started < '$DATE'
597
598
599
			ORDER BY p.date_scheduled
		"
	PACKAGES=$(mktemp --tmpdir=$TEMPDIR maintenance-XXXXXXXXXXXX)
600
	query_db "$QUERY" > $PACKAGES 2> /dev/null || echo "Warning: SQL query '$QUERY' failed."
601
602
	if grep -q '|' $PACKAGES ; then
		echo
603
		echo "Packages found where the build was started more than 48h ago:"
604
		query_to_print "$QUERY" 2> /dev/null || echo "Warning: SQL query '$QUERY' failed."
605
606
		echo
		for PKG in $(cat $PACKAGES | cut -d "|" -f1) ; do
607
608
			echo "query_db \"UPDATE schedule SET date_build_started = NULL, job = NULL WHERE package_id = '$PKG';\""
			query_db "UPDATE schedule SET date_build_started = NULL, job = NULL WHERE package_id = '$PKG';"
609
		done
610
		echo "Packages have been rescheduled."
611
612
613
614
615
616
617
618
		echo
		DIRTY=true
	fi
	rm $PACKAGES

	#
	# find packages which have been removed from the archive
	#
Holger Levsen's avatar
Holger Levsen committed
619
	echo "$(date -u) - Looking for packages which have been removed from the archive."
620
621
622
	PACKAGES=$(mktemp --tmpdir=$TEMPDIR maintenance-XXXXXXXXXX)
	QUERY="SELECT name, suite, architecture FROM removed_packages
			LIMIT 25"
623
	query_db "$QUERY" > $PACKAGES 2> /dev/null || echo "Warning: SQL query '$QUERY' failed."
624
625
626
627
628
	if grep -q '|' $PACKAGES ; then
		DIRTY=true
		echo
		echo "Found files relative to old packages, no more in the archive:"
		echo "Removing these removed packages from database:"
629
		query_to_print "$QUERY" 2> /dev/null || echo "Warning: SQL query '$QUERY' failed."
630
631
632
633
634
635
636
		echo
		# every line of $PACKAGES is name|suite|architecture
		for pkg in $(cat $PACKAGES) ; do
			PKGNAME=$(echo "$pkg" | cut -d '|' -f 1)
			SUITE=$(echo "$pkg" | cut -d '|' -f 2)
			ARCH=$(echo "$pkg" | cut -d '|' -f 3)
			QUERY="DELETE FROM removed_packages
				WHERE name='$PKGNAME' AND suite='$SUITE' AND architecture='$ARCH'"
			query_db "$QUERY"
			# The files are removed from within a subshell, so the working
			# directory of the script itself is never changed. (Several
			# 'cd $DEBIAN_BASE' followed by a single 'cd -' do not lead back
			# to the directory the loop was started in.)
			( cd $DEBIAN_BASE && find rb-pkg/$SUITE/$ARCH rbuild/$SUITE/$ARCH dbd/$SUITE/$ARCH dbdtxt/$SUITE/$ARCH buildinfo/$SUITE/$ARCH logs/$SUITE/$ARCH logdiffs/$SUITE/$ARCH -name "${PKGNAME}_*" 2>/dev/null | xargs -r rm -v ) || echo "Warning: couldn't delete old files from ${PKGNAME} in $SUITE/$ARCH"
		done
	fi
	rm $PACKAGES

	#
646
	# delete jenkins html logs from reproducible_builder_(fedora|archlinux)* jobs as they are mostly redundant
647
648
	# (they only provide the extended value of parsed console output, which we dont need here.)
	#
649
	# rm -v prints one line per removed file, wc -l turns that into a count
	OLDSTUFF=$(find /var/lib/jenkins/jobs/reproducible_builder_* -maxdepth 3 -mtime +0 -name log_content.html  -exec rm -v {} \; 2>/dev/null | wc -l)
	# the count is never an empty string, so compare it as a number
	if [ "$OLDSTUFF" -gt 0 ] ; then
		echo
		echo "Removed $OLDSTUFF jenkins html logs."
		echo
	fi
655

656
657
fi

658
# find+terminate processes which should not be there
#
# Processes owned by one of the uids in $PBUIDS which have pid 1 as parent
# are collected; those running for more than 24h are killed.
echo "$(date -u) - Looking for processes which should not be there."
HAYSTACK=$(mktemp --tmpdir=$TEMPDIR maintenance-XXXXXXXXXXX)
RESULT=$(mktemp --tmpdir=$TEMPDIR maintenance-XXXXXXXXXXX)
TOKILL=$(mktemp --tmpdir=$TEMPDIR maintenance-XXXXXXXXXXX)
PBUIDS="1234 1111 2222"
ps axo pid,user,size,pcpu,cmd > $HAYSTACK
for i in $PBUIDS ; do
	for PROCESS in $(pgrep -u $i -P 1 || true) ; do
		# faked-sysv comes and goes...
		# The pid column of ps is right aligned, so the first field is
		# compared as a whole: a '^pid' pattern misses padded lines and
		# also matches longer pids that merely start with the same digits.
		awk -v pid="$PROCESS" '$1 == pid' $HAYSTACK | grep -v faked-sysv >> $RESULT 2> /dev/null || true
	done
done
if [ -s $RESULT ] ; then
	# first field of every collected line is the pid, whatever the padding
	for PROCESS in $(awk '{print $1}' $RESULT) ; do
		# etimes is the elapsed time in seconds; 0 if the process is gone
		AGE=$(ps -p $PROCESS -o etimes= || echo 0)
		# a single build may take day, so... (first build: 18h, 2nd: 24h)
		if [ $AGE -gt $(( 24*60*60 )) ] ; then
			echo "$PROCESS" >> $TOKILL
		fi
	done
	if [ -s $TOKILL ] ; then
		DIRTY=true
		PSCALL=""
		echo
		echo "Info: processes found which should not be there, killing them now:"
		# build a comma separated pid list for a single ps call
		for PROCESS in $(cat $TOKILL) ; do
			PSCALL=${PSCALL:+"$PSCALL,"}"$PROCESS"
		done
		ps -F -p $PSCALL
		echo
		for PROCESS in $(cat $TOKILL) ; do
			sudo kill -9 $PROCESS 2>&1
			echo "'kill -9 $PROCESS' done."
		done
		echo
	fi
fi
rm $HAYSTACK $RESULT $TOKILL
697
698
# There are naughty processes spawning childs and leaving them to their grandparents
PSCALL=""
699
for i in $PBUIDS ; do
700
701
	for p in $(pgrep -u $i) ; do
		AGE=$(ps -p $p -o etimes= || echo 0)
702
703
		# let's be generous and consider 26 hours here...
		if [ $AGE -gt $(( 26*60*60 )) ] ; then
704
			sudo kill -9 $p 2>&1 || (echo "Could not kill:" ; ps -F -p "$p")
705
			sleep 2
706
707
708
709
710
			# check it's gone
			AGE=$(ps -p $p -o etimes= || echo 0)
			if [ $AGE -gt $(( 14*60*60 )) ] ; then
				PSCALL=${PSCALL:+"$PSCALL,"}"$p"
			fi
711
712
713
714
		fi
	done
done
if [ ! -z "$PSCALL" ] ; then
Holger Levsen's avatar
Holger Levsen committed
715
	echo -e "Warning: processes found which should not be there and which could not be killed. Please fix manually:"
716
717
718
	ps -F -p "$PSCALL"
	echo
fi
719

720
# find builds which should not be there
# (not on i386 as we start builds differently here… work in progress)
#
# The architecture is asked from dpkg right here: on the main node $ARCH is
# reused as loop variable by the rescheduling code further up and may no
# longer hold the host architecture at this point.
if [ "$(dpkg --print-architecture)" != "i386" ] ; then
	RESULTS=$(pgrep -f reproducible_build.sh --parent 1 || true)
	if [ ! -z "$RESULTS" ] ; then
		DIRTY=true
		echo "Warning: found reproducible_build.sh processes which have pid 1 as parent (and not sshd), thus something went wrong… please investigate."
		echo -e "$RESULTS"
	fi
fi
730

731
732
# remove artifacts older than a day
echo "$(date -u) - Checking for artifacts older than a day."
733
ARTIFACTS=$(find $DEBIAN_BASE/artifacts/r00t-me/* -maxdepth 1 -type d -mtime +1 -exec ls -lad {} \; 2>/dev/null|| true)
734
735
if [ ! -z "$ARTIFACTS" ] ; then
	echo
736
	echo "Removed old artifacts:"
737
	find $DEBIAN_BASE/artifacts/r00t-me/* -maxdepth 1 -type d -mtime +1 -exec rm -rv --one-file-system {} \; || true
738
739
740
	echo
fi

741
742
743
744
745
746
747
748
749
750
# remove artifacts from the debian live build jobs, older than a day
#
# Reads:  BASE
# Writes: ARTIFACTS (ls -lad listing of the directories which matched)
maint_remove_old_live_build_artifacts() {
	echo "$(date -u) - Checking for artifacts from debian live build jobs, that are older than a day."
	# NB: find's '-mtime +1' ignores fractions of a day, so this matches
	# directories which were last modified at least two days ago.
	ARTIFACTS=$(find "$BASE"/debian_live_build/artifacts/r00t-me/* -maxdepth 1 -type d -mtime +1 -exec ls -lad {} \; 2>/dev/null || true)
	if [ -n "$ARTIFACTS" ] ; then
		echo
		echo "Removed old artifacts:"
		# -prune: don't let find descend into a directory it has just removed
		find "$BASE"/debian_live_build/artifacts/r00t-me/* -maxdepth 1 -type d -mtime +1 -exec rm -rv --one-file-system {} \; -prune || true
		echo
	fi
}
maint_remove_old_live_build_artifacts
# find + chmod files with bad permissions
#
# Files below $DEBIAN_BASE/rbuild are fixed automatically, files in the other
# result directories are only reported, together with the command to fix them.
#
# Reads:  DEBIAN_BASE
# Writes: BADPERMS (newline separated list of the reported files),
#         DIRTY (set to true if BADPERMS is not empty)
maint_fix_bad_permissions() {
	echo "$(date -u) - Checking for files with bad permissions."
	# automatically fix rbuild files with wrong permissions...
	# (we know it happens (very rarely) but... shrugs.)
	[ ! -d "$DEBIAN_BASE/rbuild" ] || find "$DEBIAN_BASE/rbuild" ! -perm 644 -type f -exec chmod -v 644 {} + 2>/dev/null || true
	# not all of these directories exist everywhere, hence the 2>/dev/null
	BADPERMS=$(find "$DEBIAN_BASE"/{buildinfo,dbd,artifacts,stretch,buster,bullseye,bookworm,unstable,experimental,rb-pkg} ! -perm 644 -type f 2>/dev/null || true)
	if [ -n "$BADPERMS" ] ; then
		DIRTY=true
		echo
		echo "Warning: Found files with bad permissions (!=644):"
		echo "Please fix permission manually"
		# one file name per line: don't let xargs split on blanks or choke on quotes
		echo "$BADPERMS" | xargs -d '\n' echo chmod -v 644
		echo
	fi
}
maint_fix_bad_permissions
# daily mails
#
# Only on $MAINNODE and only while the hour (UTC) is 0:
# - mail every non-empty log from files_to_mail to the qa-jenkins-scm list;
#   logs below /var/log/jenkins get a short summary prepended and are removed
#   afterwards, from other logs only yesterday's lines are mailed,
# - mail the status change notifications found below
#   $REP_RESULTS/notification-emails to <package>@packages.debian.org.
#
# Reads:  HOSTNAME, MAINNODE, ARCHS, TEMPDIR, REP_RESULTS
# Writes: files_to_mail, PROBLEM, TMPFILE, MAILBODY, OTHERPROJECTS, CC, NOTE,
#         PKG, a (scratch variables)
# Note:   changes the working directory to $REP_RESULTS/notification-emails
maint_send_daily_mails() {
	if [ "$HOSTNAME" = "$MAINNODE" ] && [ $(date -u +%H) -eq 0 ]  ; then
		# once a day, send mail about builder problems
		files_to_mail=(
			/var/log/jenkins/reproducible-builder-errors.log
			/var/log/jenkins/reproducible-stale-builds.log
			/var/log/jenkins/reproducible-archlinux-stale-builds.log
			/var/log/jenkins/reproducible-race-conditions.log
			/var/log/jenkins/reproducible-diskspace-issues.log
			/var/log/jenkins/reproducible-remote-error.log
			/var/log/jenkins/reproducible-scheduler.log
			/var/log/jenkins/reproducible-env-changes.log
			/var/log/jenkins/reproducible-submit2buildinfo.debian.net.log
			/var/log/postgresql/postgresql-9.6-main.log
		)
		for PROBLEM in "${files_to_mail[@]}" ; do
			if [ -s "$PROBLEM" ] ; then
				TMPFILE=$(mktemp --tmpdir="$TEMPDIR" maintenance-XXXXXXXXXXXX)
				if [ "$(dirname "$PROBLEM")" = "/var/log/jenkins" ] ; then
					if [ "$(basename "$PROBLEM")" = "reproducible-diskspace-issues.log" ]; then
						echo "diskspace issues should always be investigated." > "$TMPFILE"
					fi
					if grep -q https "$PROBLEM" ; then
						# this line goes to the output of this job, the summary below into the mail
						echo "$(grep -c https "$PROBLEM") entries found:"
						if [ "$(basename "$PROBLEM")" != "reproducible-remote-error.log" ] && [ "$(basename "$PROBLEM")" != "reproducible-race-conditions.log" ] ; then
							OTHERPROJECTS=""
						else
							OTHERPROJECTS="archlinux fedora"
						fi
						echo "$(grep -c https "$PROBLEM" || echo 0) entries found:" >> "$TMPFILE"
						# ARCHS and OTHERPROJECTS are whitespace separated lists, thus unquoted
						for a in $ARCHS $OTHERPROJECTS; do
							echo "- $(grep https "$PROBLEM"|grep -c "${a}_") from $a." >> "$TMPFILE"
						done
					elif grep -q 'stale builds found' "$PROBLEM" ; then
						echo "$(grep -c 'stale builds found' "$PROBLEM" || echo 0) entries found:" >> "$TMPFILE"
						for a in $ARCHS ; do
							echo "- $(grep -c "${a}_" "$PROBLEM") from $a." >> "$TMPFILE"
						done
					fi
					echo >> "$TMPFILE"
					# maybe we should use logrotate for our jenkins logs too…
					cat "$PROBLEM" >> "$TMPFILE"
					rm "$PROBLEM"
				else
					# regular logfile, logrotate is used (and the file ain't owned by jenkins)
					# only care for yesterday's entries:
					( grep "$(date -u -d "1 day ago" '+%Y-%m-%d')" "$PROBLEM" || echo "no problems yesterday…" ) > "$TMPFILE"
				fi
				# send mail if we found issues
				if [ -s "$TMPFILE" ] && ! grep -q "no problems yesterday…" "$TMPFILE" ; then
					# start from a clean slate: don't pick up a CC inherited from
					# the environment or left over from somewhere else
					CC=""
					if [ "$(basename "$PROBLEM")" = "reproducible-submit2buildinfo.debian.net.log" ]; then
						CC="-c lamby@debian.org"
					fi
					# CC is deliberately unquoted: it holds an option plus its argument
					mail -s "$(basename "$PROBLEM") found" ${CC:-} qa-jenkins-scm@lists.alioth.debian.org < "$TMPFILE"
					CC=""
				fi
				rm -f "$TMPFILE"
			fi
		done
		# once a day, send notifications to package maintainers
		# (never go on in the wrong directory, whether set -e is active or not)
		cd "$REP_RESULTS/notification-emails" || exit 1
		# each file is named after the package it is about
		while IFS= read -r -d '' NOTE ; do
			TMPFILE=$(mktemp --tmpdir="$TEMPDIR" maintenance-XXXXXXXXXXXX)
			MAILBODY=$(mktemp --tmpdir="$TEMPDIR" maintenance-XXXXXXXXXXXX)
			PKG=$(basename "$NOTE")
			mv "$NOTE" "$TMPFILE"  # prevent races while running this routine and building this package.
			# the here-document must be indented with tabs, <<- strips (only) those
			cat > "$MAILBODY" <<- EOF
			Dear maintainer,

			The reproducibility status of the package $PKG changed during the
			continuous testing.
			See the following notes for more details:

			$(< "$TMPFILE")

			Feel free to reply to this email if you have questions regarding
			this automatic notification.

			-- 
			The Reproducible Builds folks
			EOF
			< "$MAILBODY" mail -s "$PKG: status change on tests.reproducible-builds.org/debian" \
				-a "From: Reproducible Builds folks <reproducible-builds@lists.alioth.debian.org>" \
				-a "X-Reproducible-Builds-Pkg: $PKG" \
				"$PKG@packages.debian.org"
			rm -f "$TMPFILE" "$MAILBODY"
		done < <(find . -type f -print0)
	fi
}
maint_send_daily_mails
# final verdict: $DIRTY holds "true" or "false" and is run as a command here,
# so the all-clear is only printed if nothing set it to true.
if ! $DIRTY ; then
	echo "$(date -u ) - Everything seems to be fine."
	echo
fi

echo "$(date -u) - the end."