Issue: #21536
Title: "25.0 upload does not respect sharing"
Reporter: bernt-matthias
Galaxy Version: 25.0
When a user has a non-shareable (private) scratch object store configured, uploading data fails with:
`galaxy.objectstore.ObjectCreationProblemSharingDisabled`

Error Message: "Job attempted to create sharable output datasets in a storage location with sharing disabled"

```
File "/gpfs1/data/galaxy_server/galaxy/lib/galaxy/jobs/runners/__init__.py", line 206, in put
  queue_job = job_wrapper.enqueue()
File "/gpfs1/data/galaxy_server/galaxy/lib/galaxy/jobs/__init__.py", line 1799, in enqueue
  self._set_object_store_ids(job)
File "/gpfs1/data/galaxy_server/galaxy/lib/galaxy/jobs/__init__.py", line 1825, in _set_object_store_ids
  self._set_object_store_ids_full(job)
File "/gpfs1/data/galaxy_server/galaxy/lib/galaxy/jobs/__init__.py", line 1916, in _set_object_store_ids_full
  object_store_populator.set_object_store_id(dataset, require_shareable=require_shareable)
File "/gpfs1/data/galaxy_server/galaxy/lib/galaxy/objectstore/__init__.py", line 2099, in set_object_store_id
  self.set_dataset_object_store_id(data.dataset, require_shareable=require_shareable)
File "/gpfs1/data/galaxy_server/galaxy/lib/galaxy/objectstore/__init__.py", line 2109, in set_dataset_object_store_id
  raise ObjectCreationProblemSharingDisabled()
galaxy.objectstore.ObjectCreationProblemSharingDisabled
```
File: /lib/galaxy/jobs/runners/__init__.py
Lines: 203-214
```python
def put(self, job_wrapper: "MinimalJobWrapper") -> None:
    """Add a job to the queue (by job identifier), indicate that the job is ready to run."""
    put_timer = ExecutionTimer()
    try:
        queue_job = job_wrapper.enqueue()
    except Exception as e:
        queue_job = False
        # Required for exceptions thrown by object store incompatibility.
        # tested by test/integration/objectstore/test_private_handling.py
        message = e.client_message if hasattr(e, "client_message") else str(e)
        job_wrapper.fail(message, exception=e)
        log.debug(f"Job [{job_wrapper.job_id}] failed to queue {put_timer}")
        return
```
File: /lib/galaxy/jobs/__init__.py
Lines: 1781-1794

```python
def enqueue(self):
    job = self.get_job()
    # Change to queued state before handing to worker thread so the runner won't pick it up again
    if self.is_task:
        self.change_state(Job.states.QUEUED, flush=False, job=job)
    elif not self.queue_with_limit(job, self.job_destination):
        return False
    job.update_output_states(self.app.application_stack.supports_skip_locked())
    # Set object store after job destination so can leverage parameters...
    self._set_object_store_ids(job)  # <-- Entry point for object store assignment
    # Now that we have the object store id, check if we are over the limit
    self._pause_job_if_over_quota(job)
    self.sa_session.commit()
    return True
```
File: /lib/galaxy/jobs/__init__.py
Lines: 1845-1924

Key snippet showing the `require_shareable` determination:

```python
def _set_object_store_ids_full(self, job: Job):
    user = job.user
    object_store_id = self.get_destination_configuration("object_store_id", None)
    # ... object_store_id resolution logic ...
    require_shareable = job.requires_shareable_storage(self.app.security_agent)  # <-- KEY LINE
    if split_object_stores is None:
        object_store_populator = ObjectStorePopulator(self.app, user)
        if object_store_id:
            object_store_populator.object_store_id = object_store_id
        for dataset_assoc in job.output_datasets + job.output_library_datasets:
            dataset = dataset_assoc.dataset
            object_store_populator.set_object_store_id(dataset, require_shareable=require_shareable)
```
File: /lib/galaxy/model/__init__.py
Lines: 2215-2226

```python
def requires_shareable_storage(self, security_agent):
    # An easy optimization would be to calculate this in galaxy.tools.actions when the
    # job is created and all the output permissions are already known. Having to reload
    # these permissions in the job code shouldn't strictly be needed.
    requires_sharing = False
    for dataset_assoc in self.output_datasets + self.output_library_datasets:
        if not security_agent.dataset_is_private_to_a_user(dataset_assoc.dataset.dataset):
            requires_sharing = True
            break
    return requires_sharing
```
File: /lib/galaxy/model/security.py
Lines: 1152-1163

```python
def dataset_is_private_to_a_user(self, dataset):
    """
    If the Dataset object has exactly one access role and that is
    the current user's private role then we consider the dataset private.
    """
    access_roles = dataset.get_access_roles(self)
    if len(access_roles) != 1:
        return False
    else:
        access_role = access_roles[0]
        return access_role.type == Role.types.PRIVATE
```
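To make the rule concrete, here is a minimal stand-alone restatement of that check (plain Python stand-ins, not the Galaxy classes); it shows why a freshly uploaded public dataset, which has zero access roles, is not "private to a user":

```python
# Stand-alone restatement of dataset_is_private_to_a_user: a dataset is
# "private to a user" only when it has exactly one access role and that
# role is a private (per-user) role.
def is_private_to_a_user(access_role_types: list) -> bool:
    return len(access_role_types) == 1 and access_role_types[0] == "private"

print(is_private_to_a_user(["private"]))           # True: exactly one private role
print(is_private_to_a_user([]))                    # False: public, no access roles
print(is_private_to_a_user(["private", "group"]))  # False: shared beyond one user
```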
File: /lib/galaxy/objectstore/__init__.py
Lines: 2105-2116

```python
def set_dataset_object_store_id(self, dataset: "Dataset", require_shareable: bool = True) -> None:
    # Create an empty file immediately. The first dataset will be
    # created in the "default" store, all others will be created in
    # the same store as the first.
    dataset.object_store_id = self.object_store_id
    try:
        concrete_store = self.object_store.create(dataset)
        if concrete_store.private and require_shareable:  # <-- EXCEPTION TRIGGER
            raise ObjectCreationProblemSharingDisabled()
    except ObjectInvalid:
        raise ObjectCreationProblemStoreFull()
    self.object_store_id = dataset.object_store_id
```
File: /lib/galaxy/tools/actions/upload_common.py
Lines: 131-151

```python
def __new_history_upload(trans, uploaded_dataset, history=None, state=None):
    if not history:
        history = trans.history
    hda = HistoryDatasetAssociation(
        name=uploaded_dataset.name,
        extension=uploaded_dataset.file_type,
        dbkey=uploaded_dataset.dbkey,
        history=history,
        create_dataset=True,
        sa_session=trans.sa_session,
    )
    trans.sa_session.add(hda)
    # ... state setup ...
    history.add_dataset(hda, genome_build=uploaded_dataset.dbkey, quota=False)
    permissions = trans.app.security_agent.history_get_default_permissions(history)  # <-- KEY LINE
    trans.app.security_agent.set_all_dataset_permissions(hda.dataset, permissions, new=True, flush=False)
    trans.sa_session.commit()
    return hda
```
File: /lib/galaxy/model/security.py
Lines: 865-873

```python
def history_get_default_permissions(self, history):
    permissions = {}
    for dhp in history.default_permissions:
        action = self.get_action(dhp.action)
        if action in permissions:
            permissions[action].append(dhp.role)
        else:
            permissions[action] = [dhp.role]
    return permissions
```
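For orientation, the mapping returned here has the shape `{action: [roles]}`. A sketch of the two cases relevant to this issue (action and role names are illustrative assumptions, not the actual Galaxy objects):

```python
# With new_user_dataset_access_role_default_private = False (the bug scenario),
# a new user's history defaults typically carry no DATASET_ACCESS entry, so
# the uploaded dataset ends up with zero access roles and counts as shareable.
public_default = {"DATASET_MANAGE_PERMISSIONS": ["alice_private_role"]}

# With the option set to True, access is restricted to the user's private
# role, which satisfies dataset_is_private_to_a_user() at enqueue time.
private_default = {
    "DATASET_MANAGE_PERMISSIONS": ["alice_private_role"],
    "DATASET_ACCESS": ["alice_private_role"],
}
```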
File: /lib/galaxy/model/security.py
Lines: 717-726

```python
def create_user_role(self, user, app):
    # Create private user role if necessary
    self.get_private_user_role(user, auto_create=True)
    # Create default user permissions if necessary
    if not user.default_permissions:
        if hasattr(app.config, "new_user_dataset_access_role_default_private"):
            permissions = app.config.new_user_dataset_access_role_default_private
            self.user_set_default_permissions(user, default_access_private=permissions)
        else:
            self.user_set_default_permissions(user, history=True, dataset=True)
```
File: /lib/galaxy/config/schemas/config_schema.yml
Lines: 2888-2896

```yaml
new_user_dataset_access_role_default_private:
  type: bool
  default: false
  required: false
  desc: |
    By default, users' data will be public, but setting this to true will cause
    it to be private. Does not affect existing users and data, only ones created
    after this option is set. Users may still change their default back to
    public.
```
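Given that schema entry, an admin-side workaround would presumably be to flip the option in `galaxy.yml` (sketch; per the desc above it only affects users created after the option is set):

```yaml
# galaxy.yml (sketch): make new users' datasets private by default so uploads
# targeting a private object store pass the require_shareable check.
galaxy:
  new_user_dataset_access_role_default_private: true
```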
File: /test/integration/objectstore/test_private_handling.py

Two test classes exist:

- `TestPrivatePreventsSharingObjectStoreIntegration` - tests with `new_user_dataset_access_role_default_private = True` (works)
- `TestPrivateCannotWritePublicDataObjectStoreIntegration` - tests with `new_user_dataset_access_role_default_private = False` (expects error)

```python
class TestPrivateCannotWritePublicDataObjectStoreIntegration(BaseObjectStoreIntegrationTestCase):
    @classmethod
    def handle_galaxy_config_kwds(cls, config):
        config["new_user_dataset_access_role_default_private"] = False  # <-- This is the bug scenario
        cls._configure_object_store(PRIVATE_OBJECT_STORE_CONFIG_TEMPLATE, config)

    def test_both_types(self):
        with self.dataset_populator.test_history() as history_id:
            response = self.dataset_populator.new_dataset_request(
                history_id, content=TEST_INPUT_FILES_CONTENT, wait=True, assert_ok=False
            )
            job = response.json()["jobs"][0]
            final_state = self.dataset_populator.wait_for_job(job["id"])
            assert final_state == "error"  # <-- Currently this is "expected" behavior
```

The issue occurs when:
- User has a private (non-shareable) object store configured (e.g., scratch storage)
- User's `new_user_dataset_access_role_default_private` is False (the default Galaxy config)
- User uploads a file
- Upload creates an HDA: `__new_history_upload()` creates a new HDA and applies the history's default permissions
- Permissions are public by default: with `new_user_dataset_access_role_default_private = False`, the dataset has no DATASET_ACCESS role restrictions (or has multiple roles), making it "shareable"
- Job enqueue checks shareability: `job.requires_shareable_storage()` checks whether any output dataset is NOT private to a single user
- Dataset appears non-private: since there is no single private access role, `dataset_is_private_to_a_user()` returns `False`
- `require_shareable` becomes True: the job now requires shareable storage
- Object store is private: when the dataset is created in the private object store, the check `concrete_store.private and require_shareable` triggers `ObjectCreationProblemSharingDisabled` (the sketch below condenses this chain)
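A condensed, runnable sketch of that chain, with plain functions standing in for the Galaxy pieces quoted above (all names here are illustrative, not Galaxy APIs):

```python
# Stand-ins for the code quoted above; none of these are real Galaxy APIs.
def is_private_to_a_user(access_role_types):
    # mirrors dataset_is_private_to_a_user above
    return len(access_role_types) == 1 and access_role_types[0] == "private"

def requires_shareable_storage(outputs_access_roles):
    # mirrors Job.requires_shareable_storage: any non-private output forces it
    return any(not is_private_to_a_user(roles) for roles in outputs_access_roles)

def create_in_store(store_is_private, require_shareable):
    # mirrors the check in ObjectStorePopulator.set_dataset_object_store_id
    if store_is_private and require_shareable:
        raise RuntimeError("ObjectCreationProblemSharingDisabled")

# Upload with public-by-default permissions: the single output has zero
# DATASET_ACCESS roles, so require_shareable is True and the private scratch
# store refuses the write -- the reported failure.
outputs = [[]]
try:
    create_in_store(store_is_private=True,
                    require_shareable=requires_shareable_storage(outputs))
except RuntimeError as err:
    print(err)  # ObjectCreationProblemSharingDisabled
```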
The upload permission setup doesn't consider what object store will be used. When a private object store is the target, the dataset permissions should automatically be set to private.
Evidence:
- `__new_history_upload()` applies history default permissions without checking the object store
- No coordination between object store selection and permission assignment
The object store is selected during `job.enqueue()`, which happens after the dataset permissions have already been set during the upload action.

Evidence:

- Dataset is created with permissions in `upload_common.py`
- Object store assignment happens in `_set_object_store_ids()`, called from `enqueue()`
- By the time the object store is known, the permissions are already committed
The current logic requires shareable storage whenever a dataset is NOT private. Perhaps for upload jobs specifically, the logic should be:

- If the object store is private, automatically make the dataset permissions private
- OR: for upload jobs, don't require shareable storage, since the output hasn't been shared yet (sketched below)

Evidence:

- The comment in `requires_shareable_storage()` says: "An easy optimization would be to calculate this in galaxy.tools.actions when the job is created"
- This suggests awareness that the current timing/location of the check is suboptimal
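A minimal sketch of what the second option could look like (illustrative only; `is_upload_job` is an assumed flag, and this is not how Galaxy currently computes the value):

```python
# Hypothetical variant of the require_shareable computation in
# _set_object_store_ids_full: upload outputs have never been shared, so they
# would not force shareable storage. is_upload_job is an assumed flag.
def effective_require_shareable(job, security_agent, is_upload_job):
    if is_upload_job:
        return False  # a fresh upload has no sharers yet
    return job.requires_shareable_storage(security_agent)
```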
Open questions:

- Should the upload tool automatically set private permissions when targeting a private object store?
  - Would require knowing the object store at dataset creation time
  - May need a new API/mechanism for this (see the sketch after this list)
- Should `requires_shareable_storage()` have different logic for upload vs. other jobs?
  - Upload outputs haven't been shared yet, so they don't "require" sharing
  - Could check whether the dataset has actually been shared rather than just its permission state
- Should user/history `preferred_object_store_id` influence default permissions?
  - If the user's preferred store is private, should their default permissions be private?
  - This could be a configuration option
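For the first question, a possible shape for the coordination (hypothetical throughout: Galaxy does not expose the target store to `__new_history_upload()` today, and the action-key handling is simplified):

```python
# Hypothetical sketch: adjust upload permissions when the target store is
# private, so the dataset passes dataset_is_private_to_a_user() at enqueue.
def choose_upload_permissions(trans, history, target_store_is_private):
    security = trans.app.security_agent
    permissions = security.history_get_default_permissions(history)
    if target_store_is_private:
        # Restrict access to the uploader's private role; "DATASET_ACCESS"
        # stands in for the real action object used by the security agent.
        private_role = security.get_private_user_role(trans.user)
        permissions["DATASET_ACCESS"] = [private_role]
    return permissions
```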
Relevant files:

- `/lib/galaxy/tools/actions/upload_common.py` - dataset permission setup
- `/lib/galaxy/jobs/__init__.py` - `_set_object_store_ids_full()` and the `requires_shareable_storage()` call
- `/lib/galaxy/model/__init__.py` - `requires_shareable_storage()` logic
- `/lib/galaxy/objectstore/__init__.py` - `set_dataset_object_store_id()` check logic