
Source Code for Module PyFoam.Infrastructure.ClusterJob

#  ICE Revision: $Id: /local/openfoam/Python/PyFoam/PyFoam/Infrastructure/ClusterJob.py 8451 2013-09-24T19:03:11.513979Z bgschaid  $
"""Encapsulates all necessary things for a cluster-job, like setting
up, running, restarting"""

import os,sys,subprocess
from os import path,unlink
from threading import Thread,Lock,Timer

from PyFoam.Applications.Decomposer import Decomposer
from PyFoam.Applications.Runner import Runner
from PyFoam.Applications.SteadyRunner import SteadyRunner
from PyFoam.Applications.CloneCase import CloneCase
from PyFoam.Applications.FromTemplate import FromTemplate

from PyFoam.FoamInformation import changeFoamVersion
from PyFoam.FoamInformation import foamVersion as getFoamVersion
from PyFoam.Error import error,warning
from PyFoam import configuration as config
from PyFoam.FoamInformation import oldAppConvention as oldApp
from PyFoam.RunDictionary.SolutionDirectory import SolutionDirectory

from PyFoam.ThirdParty.six import print_,iteritems

def checkForMessageFromAbove(job):
    if not job.listenToTimer:
        return

    if path.exists(job.stopFile()):
        job.stopJob()
        return

    if path.exists(job.checkpointFile()):
        job.writeCheckpoint()

    job.timer=Timer(1.,checkForMessageFromAbove,args=[job])
    job.timer.start()

class ClusterJob(object):
    """All Cluster-jobs are to be derived from this base-class

    The actual jobs are implemented by overriding methods

    There are a number of variables in this class that are used to
    'communicate' information between the various stages"""

    def __init__(self,
                 basename,
                 arrayJob=False,
                 hardRestart=False,
                 autoParallel=True,
                 doAutoReconstruct=None,
                 foamVersion=None,
                 compileOption=None,
                 useFoamMPI=False,
                 multiRegion=False,
                 parameters={},
                 isDecomposed=False):
        """Initializes the Job
        @param basename: Basis name of the job
        @param arrayJob: this job is a parameter variation. The tasks
        are identified by their task-id
        @param hardRestart: treat the job as restarted
        @param autoParallel: Parallelization is handled by the base-class
        @param doAutoReconstruct: Automatically reconstruct the case if
        autoParallel is set. If the value is None then it is looked up from
        the configuration
        @param foamVersion: The foam-Version that is to be used
        @param compileOption: Forces compile-option (usually 'Opt' or 'Debug')
        @param useFoamMPI: Use the OpenMPI supplied with OpenFOAM
        @param multiRegion: This job consists of multiple regions
        @param parameters: Dictionary with parameters that are being passed to the Runner
        @param isDecomposed: Assume that the job is already decomposed"""

        # print_(os.environ)

        if not "JOB_ID" in os.environ:
            error("Not an SGE-job. Environment variable JOB_ID is missing")
        self.jobID=int(os.environ["JOB_ID"])
        self.jobName=os.environ["JOB_NAME"]

        self.basename=path.join(path.abspath(path.curdir),basename)

        sgeRestarted=False
        if "RESTARTED" in os.environ:
            sgeRestarted=(int(os.environ["RESTARTED"])!=0)

        if sgeRestarted or hardRestart:
            self.restarted=True
        else:
            self.restarted=False

        if foamVersion==None:
            foamVersion=config().get("OpenFOAM","Version")

        changeFoamVersion(foamVersion,compileOption=compileOption)

        if not "WM_PROJECT_VERSION" in os.environ:
            error("No OpenFOAM-Version seems to be configured. Set the foamVersion-parameter")

        self.autoParallel=autoParallel

        self.doAutoReconstruct=doAutoReconstruct
        if self.doAutoReconstruct==None:
            self.doAutoReconstruct=config().getboolean("ClusterJob","doAutoReconstruct")

        self.multiRegion=multiRegion

        self.parameters=parameters

        self.hostfile=None
        self.nproc=1

        if "NSLOTS" in os.environ:
            self.nproc=int(os.environ["NSLOTS"])
            self.message("Running on",self.nproc,"CPUs")
            if self.nproc>1:
                # self.hostfile=os.environ["PE_HOSTFILE"]
                self.hostfile=path.join(os.environ["TMP"],"machines")
                self.message("Using the machinefile",self.hostfile)
                self.message("Contents of the machinefile:",open(self.hostfile).readlines())

        self.ordinaryEnd=True
        self.listenToTimer=False

        self.taskID=None
        self.arrayJob=arrayJob

        if self.arrayJob:
            self.taskID=int(os.environ["SGE_TASK_ID"])

        if not useFoamMPI and not foamVersion in eval(config().get("ClusterJob","useFoamMPI",default='[]')):
            ## prepend special paths for the cluster
            self.message("Adding Cluster-specific paths")
            os.environ["PATH"]=config().get("ClusterJob","path")+":"+os.environ["PATH"]
            os.environ["LD_LIBRARY_PATH"]=config().get("ClusterJob","ldpath")+":"+os.environ["LD_LIBRARY_PATH"]

        self.isDecomposed=isDecomposed

    def fullJobId(self):
        """Return a string with the full job-ID"""
        result=str(self.jobID)
        if self.arrayJob:
            result+=":"+str(self.taskID)
        return result

    def message(self,*txt):
        print_("=== CLUSTERJOB: ",end="")
        for t in txt:
            print_(t,end=" ")
        print_(" ===")
        sys.stdout.flush()

    def setState(self,txt):
        self.message("Setting Job state to",txt)
        fName=path.join(self.casedir(),"ClusterJobState")
        f=open(fName,"w")
        f.write(txt+"\n")
        f.close()

    def jobFile(self):
        """The file with the job information"""
        jobfile="%s.%d" % (self.jobName,self.jobID)
        if self.arrayJob:
            jobfile+=".%d" % self.taskID
        jobfile+=".pyFoam.clusterjob"
        jobfile=path.join(path.dirname(self.basename),jobfile)

        return jobfile

    def checkpointFile(self):
        """The file that makes the job write a checkpoint"""
        return self.jobFile()+".checkpoint"

    def stopFile(self):
        """The file that makes the job write a checkpoint and end"""
        return self.jobFile()+".stop"

    def doIt(self):
        """The central logic. Runs the job, sets it up etc"""

        f=open(self.jobFile(),"w")
        f.write(path.basename(self.basename)+"\n")
        f.close()

        self.message()
        self.message("Running on directory",self.casename())
        self.message()
        self.setState("Starting up")

        if self.arrayJob:
            for k,v in list(self.taskParameters(self.taskID).items()):
                self.parameters[k]=v

        self.parameters.update(self.additionalParameters())

        self.message("Parameters:",self.parameters)
        if not self.restarted:
            self.setState("Setting up")
            self.setup(self.parameters)
            if self.autoParallel and self.nproc>1 and not self.isDecomposed:
                self.setState("Decomposing")
                self.autoDecompose()

            self.isDecomposed=True

            self.setState("Setting up 2")
            self.postDecomposeSetup(self.parameters)
        else:
            self.setState("Restarting")

        self.isDecomposed=True

        self.setState("Running")
        self.listenToTimer=True
        self.timer=Timer(1.,checkForMessageFromAbove,args=[self])
        self.timer.start()

        self.run(self.parameters)
        self.listenToTimer=False

        if path.exists(self.jobFile()):
            unlink(self.jobFile())

        if self.ordinaryEnd:
            self.setState("Post Running")
            self.preReconstructCleanup(self.parameters)

            if self.autoParallel and self.nproc>1:
                self.setState("Reconstructing")
                self.autoReconstruct()

            if self.nproc>0:
                self.additionalReconstruct(self.parameters)

            self.setState("Cleaning")
            self.cleanup(self.parameters)
            self.setState("Finished")
        else:
            self.setState("Suspended")

        if path.exists(self.stopFile()):
            unlink(self.stopFile())
        if path.exists(self.checkpointFile()):
            unlink(self.checkpointFile())

    def casedir(self):
        """Returns the actual directory of the case
        To be overridden if appropriate"""
        if self.arrayJob:
            return "%s.%05d" % (self.basename,self.taskID)
        else:
            return self.basename

    def casename(self):
        """Returns just the name of the case"""
        return path.basename(self.casedir())

    def execute(self,cmd):
        """Execute a shell command in the case directory. No checking done
        @param cmd: the command as a string"""
        oldDir=os.getcwd()
        self.message("Changing directory to",self.casedir())
        os.chdir(self.casedir())
        self.message("Executing",cmd)
        try:
            retcode = subprocess.call(cmd,shell=True)
            if retcode < 0:
                self.message(cmd,"was terminated by signal", -retcode)
            else:
                self.message(cmd,"returned", retcode)
        except OSError:
            e = sys.exc_info()[1] # Needed because python 2.5 does not support 'as e'
            self.message(cmd,"Execution failed:", e)

        self.message("Execution of",cmd,"ended")
        self.message("Changing directory back to",oldDir)
        os.chdir(oldDir)

    def templateFile(self,fileName):
        """Looks for a template file and evaluates the template using
        the usual parameters
        @param fileName: the name of the file that will be
        constructed. The template file is the same plus the extension '.template'"""

        self.message("Building file",fileName,"from template with parameters",
                     self.parameters)

        argList=["--output-file=%s" % path.join(self.casedir(),fileName),
                 "--dump-used-values"
                 ]

        tmpl=FromTemplate(args=argList,
                          parameters=self.parameters)

    def foamRun(self,application,
                args=[],
                foamArgs=[],
                steady=False,
                multiRegion=None,
                progress=False,
                compress=False,
                noLog=False):
        """Runs a foam utility on the case.
        If it is a parallel job and the grid has
        already been decomposed (and not yet reconstructed) it is run in
        parallel
        @param application: the Foam-Application that is to be run
        @param foamArgs: A list with the additional arguments for the
        Foam-Application
        @param compress: Compress the log-file
        @param args: A list with additional arguments for the Runner-object
        @param steady: Use the steady-runner
        @param multiRegion: Run this on multiple regions (if None: I don't have an opinion on this)
        @param progress: Only output the time and nothing else
        @param noLog: Do not generate a logfile"""

        arglist=args[:]
        arglist+=["--job-id=%s" % self.fullJobId()]
        for k,v in iteritems(self.parameters):
            arglist+=["--parameter=%s:%s" % (str(k),str(v))]

        if self.isDecomposed and self.nproc>1:
            arglist+=["--procnr=%d" % self.nproc,
                      "--machinefile=%s" % self.hostfile]

        if progress:
            arglist+=["--progress"]
        if noLog:
            arglist+=["--no-log"]
        if compress:
            arglist+=["--compress"]

        if self.multiRegion:
            if multiRegion==None or multiRegion==True:
                arglist+=["--all-regions"]
        elif multiRegion and not self.multiRegion:
            warning("This is not a multi-region case, so trying to run stuff multi-region won't do any good")

        if self.restarted:
            arglist+=["--restart"]

        arglist+=[application]
        if oldApp():
            arglist+=[".",self.casename()]
        else:
            arglist+=["-case",self.casename()]

        arglist+=foamArgs

        self.message("Executing",arglist)

        if steady:
            self.message("Running Steady")
            runner=SteadyRunner(args=arglist)
        else:
            runner=Runner(args=arglist)

    def autoDecompose(self):
        """Automatically decomposes the grid with a metis-algorithm"""

        if path.isdir(path.join(self.casedir(),"processor0")):
            warning("A processor directory already exists. There might be a problem")

        defaultMethod="metis"

        if getFoamVersion()>=(1,6):
            defaultMethod="scotch"

        args=["--method="+defaultMethod,
              "--clear",
              self.casename(),
              self.nproc,
              "--job-id=%s" % self.fullJobId()]

        if self.multiRegion:
            args.append("--all-regions")

        deco=Decomposer(args=args)

    def autoReconstruct(self):
        """Default reconstruction of a parallel run"""

        if self.doAutoReconstruct:
            self.isDecomposed=False

            self.foamRun("reconstructPar",
                         args=["--logname=ReconstructPar"])
        else:
            self.message("No reconstruction (because asked to)")

    def setup(self,parameters):
        """Set up the job. Called in the beginning if the
        job has not been restarted

        Usual tasks include grid conversion/setup, mesh decomposition etc

        @param parameters: a dictionary with parameters"""

        pass

    def postDecomposeSetup(self,parameters):
        """Additional setup, to be executed when the grid is already decomposed

        Usually for tasks that can be done on a decomposed grid

        @param parameters: a dictionary with parameters"""

        pass

    def run(self,parameters):
        """Run the actual job. Usually the solver.
        @param parameters: a dictionary with parameters"""

        pass

    def preReconstructCleanup(self,parameters):
        """Additional cleanup, to be executed while the grid is still decomposed

        Usually for tasks that can be done on a decomposed grid

        @param parameters: a dictionary with parameters"""

        pass

    def cleanup(self,parameters):
        """Clean up after a job
        @param parameters: a dictionary with parameters"""

        pass

    def additionalReconstruct(self,parameters):
        """Additional reconstruction of parallel runs (stuff that the
        OpenFOAM-reconstructPar doesn't do)
        @param parameters: a dictionary with parameters"""

        pass

    def taskParameters(self,id):
        """Parameters for a specific task
        @param id: the id of the task
        @return: a dictionary with parameters for this task"""

        error("taskParameters not implemented. Not a parameterized job")

        return {}

    def additionalParameters(self):
        """Additional parameters
        @return: a dictionary with parameters for this task"""

        warning("Method 'additionalParameters' not implemented. Not a problem. Just saying")

        return {}

    def writeCheckpoint(self):
        if self.listenToTimer:
            f=open(path.join(self.basename,"write"),"w")
            f.write("Jetzt will ich's wissen")   # content is irrelevant; German for "Now I want to know"
            f.close()
            unlink(self.checkpointFile())
        else:
            warning("I'm not listening to your callbacks")

        self.timer=Timer(1.,checkForMessageFromAbove,args=[self])

    def stopJob(self):
        if self.listenToTimer:
            self.ordinaryEnd=False
            f=open(path.join(self.basename,"stop"),"w")
            f.write("Geh z'haus")   # content is irrelevant; Austrian German for "Go home"
            f.close()
            unlink(self.stopFile())
        else:
            warning("I'm not listening to your callbacks")

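# Illustrative sketch, not part of the original module: the class docstring above
# says that actual jobs are implemented by overriding methods. A minimal subclass
# could override setup() and run() as shown here; the utility "blockMesh" and the
# solver "interFoam" are assumptions for this example only.
class ExampleDamBreakJob(ClusterJob):
    def setup(self,parameters):
        # mesh generation, done once before the automatic decomposition
        self.foamRun("blockMesh")

    def run(self,parameters):
        # run the actual solver; foamRun() adds the parallel options if the case is decomposed
        self.foamRun("interFoam")
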
class SolverJob(ClusterJob):
    """A Cluster-Job that executes a solver. It implements the run-function.
    If a template-case is specified, the case is copied"""

    def __init__(self,basename,solver,
                 template=None,
                 cloneParameters=[],
                 arrayJob=False,
                 hardRestart=False,
                 autoParallel=True,
                 doAutoReconstruct=None,
                 foamVersion=None,
                 compileOption=None,
                 useFoamMPI=False,
                 steady=False,
                 multiRegion=False,
                 parameters={},
                 progress=False,
                 solverProgress=False,
                 solverNoLog=False,
                 solverLogCompress=False,
                 isDecomposed=False):
        """@param template: Name of the template-case. It is assumed that
        it resides in the same directory as the actual case
        @param cloneParameters: a list with additional parameters for the
        CloneCase-object that copies the template
        @param solverProgress: Only writes the current time of the solver"""

        ClusterJob.__init__(self,basename,
                            arrayJob=arrayJob,
                            hardRestart=hardRestart,
                            autoParallel=autoParallel,
                            doAutoReconstruct=doAutoReconstruct,
                            foamVersion=foamVersion,
                            compileOption=compileOption,
                            useFoamMPI=useFoamMPI,
                            multiRegion=multiRegion,
                            parameters=parameters,
                            isDecomposed=isDecomposed)
        self.solver=solver
        self.steady=steady
        if template!=None and not self.restarted:
            template=path.join(path.dirname(self.casedir()),template)
            if path.abspath(basename)==path.abspath(template):
                error("The basename",basename,"and the template",template,"are the same directory")
            if isDecomposed:
                cloneParameters+=["--parallel"]
            clone=CloneCase(
                args=cloneParameters+[template,self.casedir(),"--follow-symlinks"])
        self.solverProgress=solverProgress
        self.solverNoLog=solverNoLog
        self.solverLogCompress=solverLogCompress

    def run(self,parameters):
        self.foamRun(self.solver,
                     steady=self.steady,
                     multiRegion=False,
                     progress=self.solverProgress,
                     noLog=self.solverNoLog,
                     compress=self.solverLogCompress)

# Should work with Python3 and Python2
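
A minimal submit-script sketch (not part of the module above), assuming a case named damBreak, the solver interFoam, a template case damBreakTemplate and OpenFOAM version 2.2.1. Such a script only does something useful inside an SGE job, because the constructor reads JOB_ID, JOB_NAME and NSLOTS from the environment:

    from PyFoam.Infrastructure.ClusterJob import SolverJob

    job=SolverJob("damBreak",                    # basis name of the case
                  "interFoam",                   # solver that run() passes to foamRun()
                  template="damBreakTemplate",   # assumed template case, cloned on first start
                  foamVersion="2.2.1",           # assumed version; taken from the configuration if None
                  autoParallel=True)             # decomposition/reconstruction handled by the base class
    job.doIt()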