Skip to content

Commit a38e1a7

Browse files
Merge pull request #237 from stanfordnmbl/docker-stability
dev: docker qol changes
2 parents bd8d497 + 13d4f0c commit a38e1a7

6 files changed

Lines changed: 33 additions & 24 deletions

File tree

app.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,8 @@
7979
continue
8080

8181
if r.status_code == 404:
82-
logging.info("...pulling " + workerType + " trials from " + API_URL)
82+
logging.info(f"...pulling {workerType} trials from {API_URL} "
83+
f"using commit {getCommitHash()}")
8384
time.sleep(1)
8485

8586
# When using autoscaling, we will remove the instance scale-in protection if it hasn't

docker/docker-compose.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ services:
3838
options:
3939
max-size: "100m" # Rotate when the log reaches 100MB
4040
max-file: "7" # Keep the last 7 log files
41+
restart: on-failure:3
4142
mmpose:
4243
image: ${MMPOSE_IMAGE_NAME}
4344
volumes:
@@ -55,6 +56,7 @@ services:
5556
options:
5657
max-size: "100m" # Rotate when the log reaches 100MB
5758
max-file: "7" # Keep the last 7 log files
59+
restart: on-failure:3
5860

5961
volumes:
6062
data: {}

mmpose/loop_mmpose.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -69,8 +69,8 @@ def checkCudaPyTorch():
6969
if os.path.isfile(bboxPath):
7070
os.remove(bboxPath)
7171

72-
logging.info("Done. Cleaning up")
72+
logging.info("mmpose: Done. Cleaning up")
7373

7474
except:
75-
logging.info("Pose detection failed.")
75+
logging.info("mmpose: Pose detection failed.")
7676
os.remove(video_path)

openpose/loop_openpose.py

Lines changed: 15 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,7 @@ def getResolutionCommand(resolutionPoseDetection, horizontal):
9292
time.sleep(0.1)
9393
continue
9494

95-
logging.info("Processing...")
95+
logging.info("Processing openpose...")
9696

9797
if os.path.isdir(output_dir):
9898
shutil.rmtree(output_dir)
@@ -101,13 +101,18 @@ def getResolutionCommand(resolutionPoseDetection, horizontal):
101101
horizontal = getVideoOrientation(video_path)
102102
cmd_hr = getResolutionCommand(resolutionPoseDetection, horizontal)
103103

104-
check_cuda_device()
105-
command = "/openpose/build/examples/openpose/openpose.bin\
106-
--video {video_path}\
107-
--display 0\
108-
--write_json {output_dir}\
109-
--render_pose 0{cmd_hr}".format(video_path=video_path, output_dir=output_dir, cmd_hr=cmd_hr)
110-
os.system(command)
104+
try:
105+
check_cuda_device()
106+
command = "/openpose/build/examples/openpose/openpose.bin\
107+
--video {video_path}\
108+
--display 0\
109+
--write_json {output_dir}\
110+
--render_pose 0{cmd_hr}".format(video_path=video_path, output_dir=output_dir, cmd_hr=cmd_hr)
111+
os.system(command)
111112

112-
logging.info("Done. Cleaning up")
113-
os.remove(video_path)
113+
logging.info("openpose: Done. Cleaning up")
114+
os.remove(video_path)
115+
116+
except:
117+
logging.info("openpose: Pose detection failed.")
118+
os.remove(video_path)

utils.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1586,7 +1586,7 @@ def checkCudaTF():
15861586
sendStatusEmail(message=message)
15871587
raise Exception("No GPU detected. Exiting.")
15881588

1589-
def writeToJsonLog(path, new_dict, max_entries=1000):
1589+
def writeToJsonLog(path, new_dict, max_entries=1000, indent=2):
15901590
dir_name = os.path.dirname(path)
15911591
if not os.path.exists(dir_name):
15921592
os.makedirs(dir_name)
@@ -1603,7 +1603,7 @@ def writeToJsonLog(path, new_dict, max_entries=1000):
16031603
data.pop(0)
16041604

16051605
with open(path, 'w') as f:
1606-
json.dump(data, f)
1606+
json.dump(data, f, indent=indent)
16071607

16081608
def writeToErrorLog(path, session_id, trial_id, error, stack, max_entries=1000):
16091609
error_entry = {

utilsServer.py

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
import logging
77
import time
88
import random
9+
import urllib
910

1011
from main import main
1112
from utils import getDataDirectory
@@ -500,28 +501,28 @@ def runTestSession(pose='all',isDocker=True,maxNumTries=3):
500501
logging.info("\n\n\nStatus check succeeded. \n\n")
501502
return
502503

503-
# Catch and re-enter while loop if it's an HTTPError (could be more
504-
# than just 404 errors). Wait between 30 and 60 seconds before
505-
# retrying.
506-
except requests.exceptions.HTTPError as e:
504+
# Catch and re-enter while loop if it's an HTTPError or URLError
505+
# (could be more than just 404 errors). Wait between 30 and 60 seconds
506+
# before retrying.
507+
except (requests.exceptions.HTTPError, urllib.error.URLError) as e:
507508
if numTries < maxNumTries:
508-
logging.info(f"test trial failed on try #{numTries} due to HTTPError. Retrying.")
509+
logging.info(f"test trial failed on try #{numTries} due to HTTPError or URLError. Retrying.")
509510
wait_time = random.randint(30,60)
510511
logging.info(f"waiting {wait_time} seconds then retrying...")
511512
time.sleep(wait_time)
512513
continue
513514
else:
514-
logging.info(f"test trial failed on try #{numTries} due to HTTPError.")
515+
logging.info(f"test trial failed on try #{numTries} due to HTTPError or URLError.")
515516
# send email
516-
message = "A backend OpenCap machine failed the status check (HTTPError). It has been stopped."
517+
message = "A backend OpenCap machine failed the status check (HTTPError or URLError). It has been stopped."
517518
sendStatusEmail(message=message)
518-
raise Exception('Failed status check (HTTPError). Stopped.')
519+
raise Exception('Failed status check (HTTPError or URLError). Stopped.')
519520

520521
# Catch other errors and stop
521522
except:
522523
logging.info("test trial failed. stopping machine.")
523524
# send email
524-
message = "A backend OpenCap machine failed the status check. It has been stopped."
525+
message = "A backend OpenCap machine failed the status check (not HTTPError or URLError). It has been stopped."
525526
sendStatusEmail(message=message)
526527
raise Exception('Failed status check. Stopped.')
527528

0 commit comments

Comments
 (0)