LIGO Support Ticket 17291
Ticket Information
Number: admin 17291
User: anderson@ligo.caltech.edu
Email: skoranda__AT__gravity.phys.uwm.edu
Status: new
Assigned To: tannenba
Date: Mon, 10 Dec 2007 10:25:13 -0800
From: Stuart Anderson <anderson__AT__ligo.caltech.edu>
To: condor-admin__AT__cs.wisc.edu
CC: Scott Koranda <skoranda__AT__gravity.phys.uwm.edu>
Subject: LIGO: job on hold without a reason
X-Seen-BY: mailfromd 4.1 granite.cs.wisc.edu
X-MIME-Autoconverted: from quoted-printable to 8bit by chopin.cs.wisc.edu
id lBAIPN0M019841
The LIGO Condor pool at Caltech running,
# condor_version
$CondorVersion: 6.9.5 Nov 28 2007 BuildID: 65347 $
$CondorPlatform: X86_64-LINUX_RHEL3 $
has some jobs in the queue that are on hold without any specified reason, i.e.,
"JobStatus = 5" but none of the "Hold*" attributes have been set. Assuming
this is a bug rather than a feature, here is some more information.
Note: Quill is not enabled on this pool, so all information comes directly from
the Schedd.
# condor_q -long 785373.0 | egrep "JobStatus|Hold"
NumSystemHolds = 0
PeriodicHold = FALSE
OnExitHold = FALSE
JobStatus = 5
For comparison, here is a "normal" job on hold by the same user on the same
submit machine,
# condor_q -long 785883.0 | egrep "JobStatus|Hold"
NumSystemHolds = 0
PeriodicHold = FALSE
OnExitHold = FALSE
HoldReason = "Error from starter on slot1__AT__node222.ldas-cit.ligo.caltech.edu: Failed to execute '/mnt/zfs/isogait/qscan_pack/.condor_run.28274': No such file or directory"
HoldReasonCode = 6
HoldReasonSubCode = 2
JobStatus = 5
Here is the full job ClassAd for one of the problem jobs,
# condor_q -long 785373.0
-- Submitter: ldas-pcdev1.ligo.caltech.edu : <10.14.0.18:36304> : ldas-pcdev1.ligo.caltech.edu
MyType = "Job"
TargetType = "Machine"
ClusterId = 785373
QDate = 1197176084
CompletionDate = 0
Owner = "isogait"
LocalUserCpu = 0.000000
LocalSysCpu = 0.000000
RemoteUserCpu = 0.000000
RemoteSysCpu = 0.000000
ExitStatus = 0
NumCkpts_RAW = 0
NumCkpts = 0
NumJobStarts = 0
NumRestarts = 0
NumSystemHolds = 0
CommittedTime = 0
TotalSuspensions = 0
CumulativeSuspensionTime = 0
ExitBySignal = FALSE
Notification = ERROR
WantBadgers = TRUE
JOB_LEASE_DURATION = 3600
copy_to_spool = TRUE
CondorVersion = "$CondorVersion: 6.9.5 Nov 28 2007 BuildID: 65347 $"
CondorPlatform = "$CondorPlatform: X86_64-LINUX_RHEL3 $"
RootDir = "/"
Iwd = "/mnt/zfs/isogait/qscan_pack"
JobUniverse = 5
Cmd = "/mnt/zfs/isogait/qscan_pack/.condor_run.14048"
MinHosts = 1
MaxHosts = 1
WantRemoteSyscalls = FALSE
WantCheckpoint = FALSE
JobPrio = 0
User = "isogait@ligo"
NiceUser = FALSE
Environment = "CLASSPATH=/ldcg/stow_pkgs/ldg-4.5/ldg/vdt/pegasus/lib/pegasus.jar:/ldcg/stow_pkgs/ldg-4.5/ldg/vdt/pegasus/lib/xmlParserAPIs.jar:/ldcg/stow_pkgs/ldg-4.5/ldg/vdt/pegasus/lib/xmlrpc.jar:/ldcg/stow_pkgs/ldg-4.5/ldg/vdt/pegasus/lib/cryptix32.jar:/ldcg/stow_pkgs/ldg-4.5/ldg/vdt/pegasus/lib/xercesImpl.jar:/ldcg/stow_pkgs/ldg-4.5/ldg/vdt/pegasus/lib/preservcsl.jar:/ldcg/stow_pkgs/ldg-4.5/ldg/vdt/pegasus/lib/resolver.jar:/ldcg/stow_pkgs/ldg-4.5/ldg/vdt/pegasus/lib/exist.jar:/ldcg/stow_pkgs/ldg-4.5/ldg/vdt/pegasus/lib/mysql-connector-java-5.0.5-bin.jar:/ldcg/stow_pkgs/ldg-4.5/ldg/vdt/pegasus/lib/cryptix.jar:/ldcg/stow_pkgs/ldg-4.5/ldg/vdt/pegasus/lib/cog-jglobus.jar:/ldcg/stow_pkgs/ldg-4.5/ldg/vdt/pegasus/lib/postgresql-8.1dev-400.jdbc3.jar:/ldcg/stow_pkgs/ldg-4.5/ldg/vdt/pegasus/lib/exist-optional.jar:/ldcg/stow_pkgs/ldg-4.5/ldg/vdt/pegasus/lib/junit.jar:/ldcg/stow_pkgs/ldg-4.5/ldg/vdt/pegasus/lib/accessors.jar:/ldcg/stow_pkgs/ldg-4.5/ldg/vdt/pegasus/lib/jce-jdk13-125.jar:/ldcg/stow_pkgs/ldg-4.5/ldg/vdt/pegasus/lib/commons-logging.jar:/ldcg/stow_pkgs/ldg-4.5/ldg/vdt/pegasus/lib/cryptix-asn1.jar:/ldcg/stow_pkgs/ldg-4.5/ldg/vdt/pegasus/lib/log4j-1.2.8.jar:/ldcg/stow_pkgs/ldg-4.5/ldg/vdt/pegasus/lib/puretls.jar:/ldcg/stow_pkgs/ldg-4.5/ldg/vdt/pegasus/lib/java-getopt-1.0.9.jar:/ldcg/stow_pkgs/ldg-4.5/ldg/vdt/pegasus/lib/commons-pool.jar:/ldcg/stow_pkgs/ldg-4.5/ldg/vdt/pegasus/lib/xmldb.jar:/ldcg/stow_pkgs/ldg-4.5/ldg/vdt/pegasus/lib/jakarta-oro.jar:/ldcg/stow_pkgs/ldg-4.5/ldg/vdt/pegasus/lib/globus_rls_client.jar SHLIB_PATH=/ldcg/stow_pkgs/ldg-4.5/ldg/vdt/globus/lib:/ldcg/ldg/vdt/globus/lib:/ldcg/ldg/vdt/globus/lib NLSPATH=/usr/dt/lib/nls/msg/%L/%N.cat GLOBUS_OPTIONS=-Xmx512M PAC_ANCHOR=/ldcg/stow_pkgs/ldg-4.5/ldg ARCH=glnxa64 AUTOMOUNT_MAP= SHLVL=2 
LS_COLORS=no=00:fi=00:di=00;34:ln=00;36:pi=40;33:so=00;35:bd=40;33;01:cd=40;33;01:or=01;05;37;41:mi=01;05;37;41:ex=00;32:*.cmd=00;32:*.exe=00;32:*.com=00;32:*.btm=00;32:*.bat=00;32:*.sh=00;32:*.csh=00;32:*.tar=00;31:*.tgz=00;31:*.arj=00;31:*.taz=00;31:*.lzh=00;31:*.zip=00;31:*.z=00;31:*.Z=00;31:*.gz=00;31:*.bz2=00;31:*.bz=00;31:*.tz=00;31:*.rpm=00;31:*.cpio=00;31:*.jpg=00;35:*.gif=00;35:*.bmp=00;35:*.xbm=00;35:*.xpm=00;35:*.png=00;35:*.tif=00;35: PWD=/archive/home/isogait/qscan_pack LSC_DATAGRID_SERVER_LOCATION=/ldcg/ldg GRID_SECURITY_DIR=/ldcg/stow_pkgs/ldg-4.5/ldg/vdt/globus/etc SSH_AUTH_SOCK=/tmp/ssh-pkgyt30020/agent.30020 VDT_LOCATION=/ldcg/stow_pkgs/ldg-4.5/ldg/vdt SSH_CLIENT=137.22.6.77' '37347' '22 VDT_POSTINSTALL_README=/ldcg/stow_pkgs/ldg-4.5/ldg/vdt/post-install/README XFILESEARCHPATH=/ldcg/matlab_r2007a/sys/java/jre/glnxa64/jre1.5.0/lib/locale/%L/%T/%N%S::/usr/dt/app-defaults/%L/Dt PATH=/opt/lscsoft/lalapps/bin:/opt/lscsoft/lal/bin:/opt/lscsoft/glue/bin:/opt/lscsoft/libframe/bin:/opt/lscsoft/libmetaio/bin:/opt/lscsoft/framecpp/bin:/opt/lscsoft/dol/bin:/opt/lscsoft/root/bin:/ldcg/stow_pkgs/ldg-4.5/ldg/vdt/apache/bin:/ldcg/stow_pkgs/ldg-4.5/ldg/vdt/ant/bin:/ldcg/stow_pkgs/ldg-4.5/ldg/vdt/glite/sbin:/ldcg/stow_pkgs/ldg-4.5/ldg/vdt/glite/bin:/ldcg/stow_pkgs/ldg-4.5/ldg/vdt/pegasus/bin:/ldcg/stow_pkgs/ldg-4.5/ldg/vdt/pyglobus-url-copy/bin:/ldcg/stow_pkgs/ldg-4.5/ldg/vdt/unixodbc/bin:/ldcg/stow_pkgs/ldg-4.5/ldg/vdt/mysql/bin:/ldcg/stow_pkgs/ldg-4.5/ldg/vdt/edg/sbin:/ldcg/stow_pkgs/ldg-4.5/ldg/vdt/jdk1.5/bin:/ldcg/stow_pkgs/ldg-4.5/ldg/vdt/logrotate/sbin:/ldcg/stow_pkgs/ldg-4.5/ldg/vdt/gpt/sbin:/ldcg/stow_pkgs/ldg-4.5/ldg/vdt/globus/bin:/ldcg/stow_pkgs/ldg-4.5/ldg/vdt/globus/sbin:/ldcg/pacman/stow_pkgs/pacman-3.21/bin:/ldcg/stow_pkgs/ldg-4.5/ldg/vdt/vdt/sbin:/ldcg/stow_pkgs/ldg-4.5/ldg/vdt/vdt/bin:/ldcg/stow_pkgs/ldg-4.5/ldg/ldg-server/bin:/usr/kerberos/bin:/usr/bin:/bin:/usr/sbin:/sbin:/ldcg/ldg/vdt/globus/bin:/usr/X11R6/bin:/ligotools/bin:/ldcg/matlab_r2007a/b
in:. BASEMATLABPATH=/ligotools/matlab SASL_PATH=/ldcg/stow_pkgs/ldg-4.5/ldg/vdt/globus/lib/sasl VDT_INSTALL_LOG=vdt-install.log GLITE_LOCATION_LOG=/ldcg/stow_pkgs/ldg-4.5/ldg/vdt/glite/log ROOTSYS=/opt/lscsoft/root DYLD_LIBRARY_PATH=/opt/lscsoft/lal/lib64:/opt/lscsoft/glue/lib64/python2.4/site-packages:/opt/lscsoft/framecpp/lib64:/ldcg/stow_pkgs/ldg-4.5/ldg/vdt/globus/lib GLOBUS_TCP_PORT_RANGE=40000,45000 GLOBUS_PATH=/ldcg/stow_pkgs/ldg-4.5/ldg/vdt/globus X509_CERT_DIR=/ldcg/stow_pkgs/ldg-4.5/ldg/vdt/globus/TRUSTED_CA TOOLBOX=/ldcg/matlab_r2007a/toolbox LAL_PREFIX=/opt/lscsoft/lal OSG_LD_LIBRARY_PATH=/ldcg/matlab_r2007a/sys/openscenegraph/lib/glnxa64 VOMS_USERCONF=/ldcg/stow_pkgs/ldg-4.5/ldg/vdt/glite/etc/vomses LDG_SOFTWARE_LOCATION=http://www.ldas-sw.ligo.caltech.edu/ldg_dist/ldg4.5/software INPUTRC=/etc/inputrc LSCSOFT_PREFIX=/opt/lscsoft ROOT_LOCATION=/opt/lscsoft/root PKG_CONFIG_PATH=/opt/lscsoft/lal/lib64/pkgconfig:/opt/lscsoft/libframe/lib64/pkgconfig:/opt/lscsoft/libmetaio/lib64/pkgconfig:/opt/lscsoft/framecpp/lib64/pkgconfig:/opt/lscsoft/dol/lib64/pkgconfig:/opt/lscsoft/root/lib64/pkgconfig: KDEDIR=/usr GLITE_LOCATION_TMP=/ldcg/stow_pkgs/ldg-4.5/ldg/vdt/glite/tmp LIGOTOOLS=/ligotools GLITE_LOCATION=/ldcg/stow_pkgs/ldg-4.5/ldg/vdt/glite SSH_TTY=/dev/pts/40 GLITE_LOCATION_VAR=/ldcg/stow_pkgs/ldg-4.5/ldg/vdt/glite/var LIBPATH=/ldcg/stow_pkgs/ldg-4.5/ldg/vdt/globus/lib:/ldcg/ldg/vdt/globus/lib:/ldcg/ldg/vdt/globus/lib:/usr/lib:/lib SHELL=/bin/bash LDG_INSTALL_LOG=/ldcg/stow_pkgs/ldg-4.5/ldg/ldg-server/etc/ldg-install.log FRAMECPP_PREFIX=/opt/lscsoft/framecpp LDG_DIRECTORY=/ldcg/stow_pkgs/ldg-4.5/ldg/ldg-server MAIL=/var/spool/mail/isogait 
MANPATH=/opt/lscsoft/lalapps/share/man:/opt/lscsoft/lal/share/man:/opt/lscsoft/libframe/man:/opt/lscsoft/libmetaio/man:/opt/lscsoft/framecpp/share/man:/opt/lscsoft/root/man:/ldcg/stow_pkgs/ldg-4.5/ldg/vdt/pegasus/man:/ldcg/stow_pkgs/ldg-4.5/ldg/vdt/globus/man::/ldcg/stow_pkgs/ldg-4.5/ldg/vdt/vdt/man:/ldcg/stow_pkgs/ldg-4.5/ldg/vdt/perl/man:/ldcg/stow_pkgs/ldg-4.5/ldg/vdt/expat/man:/ldcg/stow_pkgs/ldg-4.5/ldg/vdt/logrotate/man:/ldcg/stow_pkgs/ldg-4.5/ldg/vdt/jdk1.5/man:/ldcg/stow_pkgs/ldg-4.5/ldg/vdt/edg/share/man:/ldcg/stow_pkgs/ldg-4.5/ldg/vdt/mysql/man:/ldcg/stow_pkgs/ldg-4.5/ldg/vdt/glite/share/man:/ldcg/stow_pkgs/ldg-4.5/ldg/vdt/apache/man GLUE_PREFIX=/opt/lscsoft/glue MYSQL_UNIX_PORT=/ldcg/stow_pkgs/ldg-4.5/ldg/vdt/vdt-app-data/mysql/var/mysql.sock DISPLAY=localhost:44.0 GLUE_LOCATION=/opt/lscsoft/glue PERL5LIB=/ldcg/stow_pkgs/ldg-4.5/ldg/ldg-server/lib:/ldcg/stow_pkgs/ldg-4.5/ldg/vdt/pegasus/lib/perl:/ldcg/stow_pkgs/ldg-4.5/ldg/vdt/vdt/lib:/ldcg/stow_pkgs/ldg-4.5/ldg/vdt/perl/lib/5.8.0:/ldcg/stow_pkgs/ldg-4.5/ldg/vdt/perl/lib/5.8.0/x86_64-linux-thread-multi:/ldcg/stow_pkgs/ldg-4.5/ldg/vdt/perl/lib/site_perl/5.8.0:/ldcg/stow_pkgs/ldg-4.5/ldg/vdt/perl/lib/site_perl/5.8.0/x86_64-linux-thread-multi: ANT_HOME=/ldcg/stow_pkgs/ldg-4.5/ldg/vdt/ant USER=isogait SSH_CONNECTION=137.22.6.77' '37347' '131.215.115.249' '22 DOL_LOCATION=/opt/lscsoft/dol HOSTNAME=ldas-pcdev1 
LD_LIBRARY_PATH=/ldcg/matlab_r2007a/sys/os/glnxa64:/ldcg/matlab_r2007a/bin/glnxa64:/ldcg/matlab_r2007a/extern/lib/glnxa64:/ldcg/matlab_r2007a/sys/java/jre/glnxa64/jre1.5.0/lib/amd64/native_threads:/ldcg/matlab_r2007a/sys/java/jre/glnxa64/jre1.5.0/lib/amd64/server:/ldcg/matlab_r2007a/sys/java/jre/glnxa64/jre1.5.0/lib/amd64:/opt/lscsoft/lal/lib64:/opt/lscsoft/glue/lib64/python2.4/site-packages:/opt/lscsoft/libframe/lib64:/opt/lscsoft/libmetaio/lib64:/opt/lscsoft/framecpp/lib64:/opt/lscsoft/dol/lib64:/opt/lscsoft/root/lib64:/ldcg/stow_pkgs/ldg-4.5/ldg/vdt/tclglobus/lib:/ldcg/stow_pkgs/ldg-4.5/ldg/vdt/apache/lib:/ldcg/stow_pkgs/ldg-4.5/ldg/vdt/glite/lib:/ldcg/stow_pkgs/ldg-4.5/ldg/vdt/myodbc/lib:/ldcg/stow_pkgs/ldg-4.5/ldg/vdt/unixodbc/lib:/ldcg/stow_pkgs/ldg-4.5/ldg/vdt/mysql/lib/mysql:/ldcg/stow_pkgs/ldg-4.5/ldg/vdt/jdk1.5/jre/lib/i386:/ldcg/stow_pkgs/ldg-4.5/ldg/vdt/jdk1.5/jre/lib/i386/server:/ldcg/stow_pkgs/ldg-4.5/ldg/vdt/jdk1.5/jre/lib/i386/client:/ldcg/stow_pkgs/ldg-4.5/ldg/vdt/berkeley-db/lib:/ldcg/stow_pkgs/ldg-4.5/ldg/vdt/expat/lib:/ldcg/stow_pkgs/ldg-4.5/ldg/vdt/globus/lib:/ldcg/ldg/vdt/globus/lib:/ldcg/ldg/vdt/globus/lib:/ligotools/lib PYTHONPATH=/opt/lscsoft/lalapps/lib64/python2.4/site-packages:/opt/lscsoft/lalapps/lib/python2.4/site-packages:/opt/lscsoft/glue/lib64/python2.4/site-packages:/opt/lscsoft/glue/lib/python2.4/site-packages:/opt/lscsoft/libframe/lib64/python:/opt/lscsoft/libmetaio/lib64/python:/ldcg/stow_pkgs/ldg-4.5/ldg/ldg-server/lib64/python:/ldcg/stow_pkgs/ldg-4.5/ldg/vdt/globus/lib64/python:/ldcg/stow_pkgs/ldg-4.5/ldg/vdt/globus/lib/python: X509_CADIR=/ldcg/stow_pkgs/ldg-4.5/ldg/vdt/globus/TRUSTED_CA ODBCINI=/ldcg/stow_pkgs/ldg-4.5/ldg/vdt/unixodbc/etc/odbc.ini CATALINA_OPTS=-Dorg.globus.wsrf.container.persistence.dir=/ldcg/stow_pkgs/ldg-4.5/ldg/vdt/vdt-app-data/globus/persisted HOME=/archive/home/isogait LAL_LOCATION=/opt/lscsoft/lalapps LOGNAME=isogait EDG_LOCATION=/ldcg/stow_pkgs/ldg-4.5/ldg/vdt/edg 
MATLABPATH=/ligotools/matlab:/ldcg/matlab_r2007a/toolbox/local GPT_LOCATION=/ldcg/stow_pkgs/ldg-4.5/ldg/vdt/gpt _=/usr/bin/condor_run GLOBUS_ERROR_VERBOSE=true JAVA_HOME=/ldcg/stow_pkgs/ldg-4.5/ldg/vdt/jdk1.5 G_BROKEN_FILENAMES=1 FRAMECPP_LOCATION=/opt/lscsoft/framecpp LANG=C GLOBUS_LOCATION=/ldcg/stow_pkgs/ldg-4.5/ldg/vdt/globus CONDOR_CONFIG=/usr1/condor/condor_config HISTSIZE=1000 LSC_SEGFIND_SERVER=ldas-cit.ligo.caltech.edu XKEYSYMDB=/ldcg/matlab_r2007a/X11/app-defaults/XKeysymDB GLOBUS_MYSQL_PATH=/ldcg/stow_pkgs/ldg-4.5/ldg/vdt/mysql MATLAB=/ldcg/matlab_r2007a PACMAN_LOCATION=/ldcg/pacman/stow_pkgs/pacman-3.21 PEGASUS_HOME=/ldcg/stow_pkgs/ldg-4.5/ldg/vdt/pegasus DAGDBUPDATORLOCKFILE=/etc/onasys-dblockfile X509_VOMS_DIR=/ldcg/stow_pkgs/ldg-4.5/ldg/vdt/glite/vomsdir CONDOR_LOCATION=/usr XAPPLRESDIR=/ldcg/matlab_r2007a/X11/app-defaults LDG_LOCATION=/ldcg/stow_pkgs/ldg-4.5/ldg TERM=dumb LESSOPEN=|/usr/bin/lesspipe.sh' '%s X509_USER_PROXY=/tmp/x509up_p30020.fileM6OH4Y.1 BOSSDIR=/etc LSC_DATAFIND_SERVER=ldas-cit.ligo.caltech.edu"
JobNotification = 0
WantRemoteIO = FALSE
UserLog = "/mnt/zfs/isogait/qscan_pack/.condor_log.14048"
CoreSize = 0
KillSig = "SIGTERM"
Rank = 0.000000
In = "/dev/null"
TransferIn = FALSE
Out = ".condor_out.14048"
StreamOut = FALSE
Err = ".condor_error.14048"
StreamErr = FALSE
BufferSize = 524288
BufferBlockSize = 32768
ShouldTransferFiles = "NO"
TransferFiles = "NEVER"
ImageSize_RAW = 1
ImageSize = 1
ExecutableSize_RAW = 1
ExecutableSize = 1
DiskUsage_RAW = 1
DiskUsage = 1
Requirements = (Arch == "X86_64") && (OpSys == "LINUX") && (Disk >= DiskUsage) && ((Memory * 1024) >= ImageSize) && (TARGET.FileSystemDomain == MY.FileSystemDomain)
FileSystemDomain = "ligo"
JobLeaseDuration = 3600
PeriodicHold = FALSE
PeriodicRelease = FALSE
PeriodicRemove = FALSE
OnExitHold = FALSE
OnExitRemove = TRUE
LeaveJobInQueue = FALSE
Arguments = ""
GlobalJobId = "ldas-pcdev1.ligo.caltech.edu#1197176635#785373.0"
ProcId = 0
AutoClusterId = 3
AutoClusterAttrs = "JobUniverse,LastCheckpointPlatform,NumCkpts,JobStart,DiskUsage,ImageSize,FileSystemDomain,Requirements,NiceUser"
WantMatchDiagnostics = TRUE
LastMatchTime = 1197179443
NumJobMatches = 1
OrigMaxHosts = 1
JobStartDate = 1197179544
JobCurrentStartDate = 1197179544
NumShadowStarts = 1
JobRunCount = 1
LastJobLeaseRenewal = 1197179892
JobStatus = 5
EnteredCurrentStatus = 1197180136
LastSuspensionTime = 0
RemoteWallClockTime = 592.000000
LastRemoteHost = "slot4__AT__node130.ldas-cit.ligo.caltech.edu"
LastPublicClaimId = "<10.14.1.130:43765>#1196536907#1553#..."
LastPublicClaimIds = ""
CurrentHosts = 0
ServerTime = 1197310795
--
Stuart Anderson anderson__AT__ligo.caltech.edu
http://www.ligo.caltech.edu/~anderson
===========================================================================
Date of creation: Mon Dec 10 12:25:32 2007 (1197311135)
Subject: Actions
Assigned to tannenba by tlmiller
===========================================================================
Date of actions: Tue Dec 11 15:48:09 2007 (1197409689)