Changeset 1600 for trunk/src/lib/Components/qsim.py
- Timestamp:
- 07/02/09 13:17:06 (9 months ago)
- Files:
-
- 1 modified
-
trunk/src/lib/Components/qsim.py (modified) (15 diffs)
Legend:
- Unmodified
- Added
- Removed
-
trunk/src/lib/Components/qsim.py
```
  r1599 r1600
    146   146  end_time: unix second, float
    147   147  failure_time: unix second, float
-   148        location: 'partition:partition', string
+         148  location: list of string(partition name)
    149   149  state: ['invisible', 'running', 'queued', 'ended', 'pending'] string
    150   150  is_visible: true/false
      …     …
    201   201  self.recovery_opt = spec.get("recovery_opt", RECOVERYOPT)
    202   202  self.checkpoint = 1
-   203        self.location = ''
+         203  self.location = []
    204   204
    205   205  class JobList(DataList):
      …     …
    332   332
    333   333  #initialize time stamps and job queues
-   334        #time stamp format: ('EVENT', 'time_stamp_date', time_stamp_second, {'job_id':str(jobid), 'location': 'partition-name'})
+         334  #time stamp format: ('EVENT', 'time_stamp_date', time_stamp_second, {'job_id':str(jobid), 'location':[partition1, partition2,...]})
    335   335  self.time_stamps = [('I', '0', 0, {})]
    336   336  self.cur_time_index = 0
      …     …
    536   536  message = "%s;Q;%d;queue=%s" % (timestamp, spec['jobid'], spec['queue'])
    537   537  elif eventtype == 'R': #resume running after failure recovery
-   538        message = "%s;R;%s" % (timestamp, spec['location'])
+         538  message = "%s;R;%s" % (timestamp, ":".join(spec['location']))
    539   539  else:
    540   540  wall_time = spec['walltime']
      …     …
    545   545  message = "%s;S;%d;queue=%s qtime=%s Resource_List.ncpus=%s Resource_List.walltime=%s start=%s exec_host=%s" % \
    546   546  (timestamp, spec['jobid'], spec['queue'], spec['submittime'],
-   547        spec['nodes'], log_walltime, spec['start_time'], spec['location'])
+         547  spec['nodes'], log_walltime, spec['start_time'], ":".join(spec['location']))
    548   548  elif eventtype == 'E': #end
    549   549  message = "%s;E;%d;queue=%s qtime=%s Resource_List.ncpus=%s Resource_List.walltime=%s start=%s end=%f exec_host=%s runtime=%s" % \
    550   550  (timestamp, spec['jobid'], spec['queue'], spec['submittime'], spec['nodes'], log_walltime, spec['start_time'],
-   551        round(float(spec['end_time']), 1), spec['location'],
+         551  round(float(spec['end_time']), 1), ":".join(spec['location']),
    552   552  spec['runtime'])
    553   553  elif eventtype == 'F': #failure
      …     …
    555   555  message = "%s;F;%d;queue=%s qtime=%s Resource_List.ncpus=%s Resource_List.walltime=%s exec_host=%s start=%s frag_runtime=%s complete=%f" % \
    556   556  (timestamp, spec['jobid'], spec['queue'], spec['submittime'],
-   557        spec['nodes'], log_walltime, spec['location'], spec['start_time'],
+         557  spec['nodes'], log_walltime, ":".join(spec['location']), spec['start_time'],
    558   558  frag_runtime, round(frag_runtime / float(spec['runtime']), 2)
    559   559  )
      …     …
    561   561  message = "%s;P;%d;queue=%s qtime=%s Resource_List.ncpus=%s Resource_List.walltime=%s exec_host=%s start=%s" % \
    562   562  (timestamp, spec['jobid'], spec['queue'], spec['submittime'],
-   563        spec['nodes'], log_walltime, spec['location'], spec['start_time'],
+         563  spec['nodes'], log_walltime, ":".join(spec['location']), spec['start_time'],
    564   564  )
    565   565  print "message=", message
      …     …
    595   595
    596   596  #release partition immediately
-   597        partitions = jobspec['location'] .split(':')
+         597  partitions = jobspec['location']
    598   598  for partition in partitions:
    599   599  self.release_partition(partition)
      …     …
    613   613
    614   614  #release partition
-   615        partitions = jobspec['location'] .split(':')
+         615  partitions = jobspec['location']
    616   616  for partition in partitions:
    617   617  print "partition %s start repairing" % (partition)
      …     …
    625   625  failure_datetime = sec_to_date(fail)
    626   626  self.log_job_event('F', failure_datetime, jobspec)
-   627        print self.get_current_time(), " job %d failed at %s!!" % (job_id, jobspec['location'])
+         627  print self.get_current_time(), " job %d failed at %s!!" % (job_id, ":".join(jobspec['location']))
    628   628
    629   629  rec_updates = self.recovery_mgr(jobspec)
      …     …
    714   714  '''update the job state and start_time and end_time when cqadm --run
    715   715  is issued to a group of jobs'''
-   716        partitions = updates['location'] .split(':')
+         716  partitions = updates['location']
    717   717  for partition in partitions:
    718   718  self.reserve_partition(partition)
      …     …
    839   839  print "run job specs=", specs, " on partion", nodelist
    840   840  if specs:
-   841        self.start_job(specs, {'location': ":".join(nodelist)})
+         841  self.start_job(specs, {'location': nodelist})
    842   842  #set tag false, enable scheduling another job at the same time
    843   843  self.increment_tag = False
      …     …
    867   867  return midplane_list
    868   868
-   869        def get_next_failure(self, location, now, duration): #change for rarm
+         869  def get_next_failure(self, location, now, duration):
    870   870  '''return the next(closest) failure moment according the partition failure list'''
    871   871
      …     …
    884   884
    885   885  closest_fail_sec = MAXINT
-   886        partitions = location .split(':')
-   887
+         886  partitions = location
+         887
    888   888  midplanes = set()
    889   889  for partition in partitions:
      …     …
   1183  1183  job_end_times = {}
   1184  1184  for item in end_times:
-  1185        job_end_times[item[0] ] = item[1]
-  1186
+        1185  job_end_times[item[0][0]] = item[1]
+        1186
   1187  1187  now = self.get_current_time_sec()
   1188  1188  for p in self.cached_partitions.itervalues():
```
![(please configure the [header_logo] section in trac.ini)](/projects/cobalt/chrome/common/trac_banner.png)