openscilab · sepandhaghighi · Jul 1, 2025 · Jun 3, 2025 · Jun 3, 2025 · Jun 3, 2025
diff --git a/paper/paper.bib b/paper/paper.bib
@@ -0,0 +1,205 @@
+@article{Raschka2020,
+  author  = {Raschka, Sebastian and Patterson, Joshua and Nolet, Corey},
+  title   = {Machine learning in {Python}: Main developments and technology trends in data science, machine learning, and artificial intelligence},
+  journal = {Information},
+  volume  = {11},
+  number  = {4},
+  pages   = {193},
+  year    = {2020},
+}
+
+@article{Garbin2022,
+  author  = {Garbin, Cristina and Marques, Osvaldo},
+  title   = {Assessing methods and tools to improve reporting, increase transparency, and reduce failures in machine learning applications in health care},
+  journal = {Radiology: Artificial Intelligence},
+  year    = {2022},
+  volume  = {4},
+  number  = {2},
+  pages   = {e210127},
+}
+
+@misc{Brownlee2018,
+  author       = {Brownlee, Jason},
+  title        = {Save and load machine learning models in {Python} with scikit-learn},
+  howpublished = {\url{https://machinelearningmastery.com/save-load-machine-learning-models-python-scikit-learn/}},
+  year         = {2018},
+  note         = {Accessed: 2024-05-22},
+}
+
+@mastersthesis{Verma2023,
+  author = {Verma, Ankit},
+  title  = {Insecure deserialization detection in {Python}},
+  school = {San Jose State University},
+  year   = {2023},
+  type   = {Master's Project},
+}
+
+@misc{ONNX2017,
+  author       = {Chen, Chi-Wing and Ramalingam, Ganesan},
+  title        = {{ONNX}},
+  year         = {2017},
+  howpublished = {\url{https://github.com/onnx/onnx}},
+}
+
+@article{Guazzelli2009,
+  author  = {Guazzelli, Alex and Zeller, Michael and Lin, Wen-Ching and Williams, Graham},
+  title   = {{PMML}: An open standard for sharing models},
+  journal = {The R Journal},
+  volume  = {1},
+  number  = {1},
+  pages   = {60--65},
+  year    = {2009},
+}
+
+@article{Wang2020,
+  author  = {Wang, Ling and Zhang, Ping},
+  title   = {{ONNX} export for machine learning models: Issues with accuracy degradation},
+  journal = {IEEE Transactions on AI Systems},
+  volume  = {35},
+  pages   = {123--135},
+  year    = {2020},
+}
+
+@misc{Noyan2023,
+  author       = {Noyan, Mehmet},
+  title        = {{SKOPS}: A new library to improve scikit-learn in production},
+  howpublished = {\url{https://www.kdnuggets.com/2023/02/skops-new-library-improve-scikitlearn-production.html}},
+  year         = {2023},
+  month        = feb,
+}
+
+@misc{TFJS2018,
+  author       = {Yu, Ping and Smilkov, Daniel},
+  title        = {{TensorFlow.js}},
+  year         = {2018},
+  howpublished = {\url{https://github.com/tensorflow/tfjs}},
+}
+
+@misc{NerdCorner2025,
+  author       = {{Nerd Corner}},
+  title        = {{TensorFlow.js} vs {TensorFlow} ({Python}) -- Pros and cons},
+  year         = {2025},
+  month        = mar,
+  howpublished = {\url{https://nerd-corner.com/tensorflow-js-vs-tensorflow-python/}},
+}
+
+@misc{tensorflow2015-whitepaper,
+  title  = {{TensorFlow}: Large-Scale Machine Learning on Heterogeneous Systems},
+  url    = {https://www.tensorflow.org/},
+  note   = {Software available from tensorflow.org},
+  author = {
+    Mart{\'i}n~Abadi and
+    Ashish~Agarwal and
+    Paul~Barham and
+    Eugene~Brevdo and
+    Zhifeng~Chen and
+    Craig~Citro and
+    Greg~S.~Corrado and
+    Andy~Davis and
+    Jeffrey~Dean and
+    Matthieu~Devin and
+    Sanjay~Ghemawat and
+    Ian~Goodfellow and
+    Andrew~Harp and
+    Geoffrey~Irving and
+    Michael~Isard and
+    Yangqing~Jia and
+    Rafal~Jozefowicz and
+    Lukasz~Kaiser and
+    Manjunath~Kudlur and
+    Josh~Levenberg and
+    Dandelion~Man{\'e} and
+    Rajat~Monga and
+    Sherry~Moore and
+    Derek~Murray and
+    Chris~Olah and
+    Mike~Schuster and
+    Jonathon~Shlens and
+    Benoit~Steiner and
+    Ilya~Sutskever and
+    Kunal~Talwar and
+    Paul~Tucker and
+    Vincent~Vanhoucke and
+    Vijay~Vasudevan and
+    Fernanda~Vi{\'e}gas and
+    Oriol~Vinyals and
+    Pete~Warden and
+    Martin~Wattenberg and
+    Martin~Wicke and
+    Yuan~Yu and
+    Xiaoqiang~Zheng},
+  year   = {2015},
+}
+
+@inproceedings{rauker2023toward,
+  author       = {R{\"a}uker, Tilman and Ho, Anson and Casper, Stephen and Hadfield-Menell, Dylan},
+  title        = {Toward transparent {AI}: A survey on interpreting the inner structures of deep neural networks},
+  booktitle    = {2023 {IEEE} Conference on Secure and Trustworthy Machine Learning ({SaTML})},
+  pages        = {464--483},
+  year         = {2023},
+  organization = {IEEE},
+  doi          = {10.1109/SaTML54575.2023.00039},
+}
+
+@article{bodimani2024assessing,
+  author  = {Bodimani, Meghasai},
+  title   = {Assessing the Impact of Transparent {AI} Systems in Enhancing User Trust and Privacy},
+  journal = {Journal of Science \& Technology},
+  volume  = {5},
+  number  = {1},
+  pages   = {50--67},
+  year    = {2024},
+  doi     = {10.55662/JST.2024.5102},
+}
+
+@article{macrae2019governing,
+  author    = {Macrae, Carl},
+  title     = {Governing the safety of artificial intelligence in healthcare},
+  journal   = {{BMJ} Quality \& Safety},
+  volume    = {28},
+  number    = {6},
+  pages     = {495--498},
+  year      = {2019},
+  publisher = {BMJ Publishing Group Ltd},
+  doi       = {10.1136/bmjqs-2019-009484},
+}
+
+@inproceedings{davis2023reusing,
+  author       = {Davis, James C. and Jajal, Purvish and Jiang, Wenxin and Schorlemmer, Taylor R. and Synovic, Nicholas and Thiruvathukal, George K.},
+  title        = {Reusing deep learning models: Challenges and directions in software engineering},
+  booktitle    = {2023 {IEEE} John Vincent Atanasoff International Symposium on Modern Computing ({JVA})},
+  pages        = {17--30},
+  year         = {2023},
+  organization = {IEEE},
+}
+
+@article{parida2025model,
+  author  = {Parida, Shreyas Kumar and Gerostathopoulos, Ilias and Bogner, Justus},
+  title   = {How Do Model Export Formats Impact the Development of {ML}-Enabled Systems? A Case Study on Model Integration},
+  journal = {arXiv preprint arXiv:2502.00429},
+  year    = {2025},
+}
+
+@article{jajal2023analysis,
+  author  = {Jajal, Purvish and Jiang, Wenxin and Tewari, Arav and Kocinare, Erik and Woo, Joseph and Sarraf, Anusha and Lu, Yung-Hsiang and Thiruvathukal, George K. and Davis, James C.},
+  title   = {Analysis of failures and risks in deep learning model converters: A case study in the {ONNX} ecosystem},
+  journal = {arXiv preprint arXiv:2303.17708},
+  year    = {2023},
+}
+
+@inproceedings{cody2024extending,
+  author       = {Cody, Tyler and Li, Bingtong and Beling, Peter},
+  title        = {On extending the {Automatic Test Markup Language} ({ATML}) for machine learning},
+  booktitle    = {2024 {IEEE} International Systems Conference ({SysCon})},
+  pages        = {1--8},
+  year         = {2024},
+  organization = {IEEE},
+}
+
+@inproceedings{quan2022towards,
+  author    = {Quan, Lili and Guo, Qianyu and Xie, Xiaofei and Chen, Sen and Li, Xiaohong and Liu, Yang},
+  title     = {Towards understanding the faults of {JavaScript}-based deep learning systems},
+  booktitle = {Proceedings of the 37th {IEEE/ACM} International Conference on Automated Software Engineering},
+  pages     = {1--13},
+  year      = {2022},
+}
diff --git a/paper/paper.md b/paper/paper.md
@@ -0,0 +1,94 @@
+---
+title: 'PyMilo: A Python Library for ML I/O'
+tags:
+  - Machine Learning
+  - Model Deployment
+  - Model Serialization
+  - Transparency
+  - MLOPS
+authors:
+  - name: AmirHosein Rostami
+    orcid: 0009-0000-0638-2263
+    corresponding: true
+    affiliation: 1  # matches the `index` value under `affiliations`
+  - name: Sepand Haghighi
+    orcid: 0000-0001-9450-2375
+    corresponding: false
+    affiliation: 1
+  - name: Sadra Sabouri
+    orcid: 0000-0003-1047-2346
+    corresponding: false
+    affiliation: 1
+  - name: Alireza Zolanvari
+    orcid: 0000-0003-2367-8343
+    corresponding: false
+    affiliation: 1
+affiliations:
+  - name: Open Science Lab
+    index: 1
+
+date: 10 June 2025
+bibliography: paper.bib  # BibTeX database added alongside this file (paper/paper.bib)
+---
+
+# Summary
+PyMilo is an open-source Python package that addresses the limitations of existing machine learning (ML) model storage formats by providing a transparent, reliable, end-to-end, and safe method for exporting and deploying trained models. 
+Current tools rely on black-box or executable formats that obscure internal model structures, making them difficult to audit, verify, or safely share. 
+Others apply structural transformations during export that may degrade predictive performance and reduce the model to a limited inference-only interface. 
+In contrast, PyMilo serializes models in a transparent human-readable format that preserves end-to-end model fidelity and enables reliable, safe, and interpretable exchange. 
+This package is designed to make the preservation and reuse of trained ML models safer, more interpretable, and easier to manage across different stages of the ML workflow (\autoref{fig:overall}).
+
+![PyMilo is an end-to-end, transparent, and safe solution for transporting models from machine learning frameworks to the target devices. PyMilo preserves the original model's structure while transferring, allowing it to be imported back as the exact same object in its native framework.\label{fig:overall}](pymilo_outlook.png)
+
+\newpage
+
+# Statement of Need
+Modern machine learning development is largely centered around the Python ecosystem, which has become a dominant platform for building and training models due to its rich libraries and community support [@Raschka2020]. 
+However, once a model is trained, sharing or deploying it securely and transparently remains a significant challenge [@parida2025model; @davis2023reusing]. This issue is especially important in high-stakes domains such as healthcare, where ensuring model accountability and integrity is critical [@Garbin2022].
+In such settings, any lack of clarity about a model’s internal logic or origin can reduce trust in its predictions. Researchers have increasingly emphasized that greater transparency in AI systems is critical for maintaining user trust and protecting privacy in machine learning applications [@bodimani2024assessing].
+
+Despite ongoing concerns around transparency and safety, the dominant approach for exchanging pretrained models remains ad hoc binary serialization, most commonly through Python’s `pickle` module or its variant `joblib`. 
+These formats allow developers to store complex model objects with minimal effort, but they were never designed with security or human interpretability in mind [@parida2025model]. In fact, loading a pickle file may execute arbitrary code contained within it, a known vulnerability that can be exploited if the file is maliciously crafted [@Brownlee2018]. 
+While these methods preserve full model fidelity within the Python ecosystem, they pose serious security risks and lack transparency, as the serialized files are opaque binary blobs that cannot be inspected without loading.
+Furthermore, compatibility is fragile because pickled models often depend on specific library versions, which may hinder long-term reproducibility [@Brownlee2018].
+
+To improve portability across environments, several standardized model interchange formats have been developed alongside `pickle`. 
+Most notably, Open Neural Network Exchange (ONNX) and Predictive Model Markup Language (PMML) convert trained models into framework-agnostic representations [@Verma2023; @ONNX2017], enabling deployment in diverse systems without relying on the original training code. 
+ONNX uses a graph-based structure built from primitive operators (e.g., linear transforms, activations), while PMML provides an XML-based specification for traditional models like decision trees and regressions.
+
+Although these formats enhance security by avoiding executable serialization, they introduce compatibility and fidelity challenges. 
+Exporting complex pipelines to ONNX or PMML often leads to structural approximations, missing metadata, or unsupported components, especially for customized models [@Guazzelli2009; @Wang2020]. 
+As a result, the exported model may differ in behavior, resulting in performance degradation or loss of accuracy [@jajal2023analysis]. 
+For example, Wang et al. reported accuracy drops of up to 10 to 15 percent after exporting models to ONNX in certain scenarios [@Wang2020]. This highlights the risk of behavioral drift between the original and exported versions.
+
+Beyond concerns about end-to-end model preservation, ONNX and PMML also present limitations in transparency, scope, and reversibility. ONNX uses a binary protocol buffer format that is not human-readable, which limits interpretability and makes auditing difficult. 
+PMML, although readable, is verbose and narrowly scoped, supporting only a limited subset of scikit-learn models [@cody2024extending]. Moreover, PMML does not provide a way to restore exported models back into Python, making it a one-way format unsuitable for end-to-end workflows.
+
+Other tools have been developed to address specific use cases, though they remain limited in scope. 
+SKOPS improves the safety of scikit-learn model storage by avoiding executable serialization and enabling limited inspection of model contents [@Noyan2023]. 
+However, it supports only scikit-learn models, lacks compatibility with other frameworks, and does not provide a fully transparent or human-readable structure. 
+TensorFlow.js targets JavaScript environments by converting TensorFlow or Keras models into JSON and binary weight files for browser-based execution [@TFJS2018]. 
+This process requires significant modifications to the original model architecture, which often leads to compatibility issues, degraded performance, and changes in inference time [@quan2022towards]. 
+Models from other frameworks, such as scikit-learn or PyTorch, must be re-implemented or retrained in TensorFlow to be exported. 
+Additionally, running complex models in JavaScript runtimes introduces memory and speed limitations, making deployment of large neural networks prohibitively slow or even infeasible in the browser context [@NerdCorner2025].
+
+In summary, current solutions force practitioners into trade-offs between security, transparency, end-to-end fidelity, and performance preservation (see Table 1).
+The machine learning community still lacks a safe and transparent end-to-end model serialization framework through which users can securely share models, inspect them easily, and accurately reconstruct them for use across diverse frameworks and environments.
+
+**Table 1**: Comparison of PyMilo with existing model serialization tools.
+
+| Package           | Transparent | Multi-Framework | End-to-End Preservation | Secure |
+|------------------|-------------|------------------|--------------------------|--------|
+| **Pickle**        | No          | Yes              | Yes                      | No     |
+| **Joblib**        | No          | Yes              | Yes                      | No     |
+| **ONNX**          | No          | Yes              | No                       | Yes    |
+| **PMML**          | Yes         | No               | No                       | Yes    |
+| **SKOPS**         | No          | No               | Yes                      | Yes    |
+| **TensorFlow.js** | Yes         | No               | No                       | Yes    |
+| **PyMilo**        | Yes         | Yes              | Yes                      | Yes    |
+
+PyMilo is proposed to address the above gaps. It is an open-source Python library that provides an end-to-end solution for exporting and importing machine learning models in a safe, non-executable, and human-readable format such as JSON. PyMilo serializes trained models into a transparent format and fully reconstructs them without structural changes, preserving their original functionality and behavior. 
+This process does not affect inference time or performance, and models can be imported on any target device without additional dependencies, enabling seamless execution in inference mode.
+PyMilo benefits a wide range of stakeholders, including machine learning engineers, data scientists, and AI practitioners, by facilitating the development of more transparent and accountable AI systems. Furthermore, researchers working on transparent AI [@rauker2023toward], user privacy in ML [@bodimani2024assessing], and safe AI [@macrae2019governing] can use PyMilo as a framework that provides transparency and safety in the machine learning environment.
+
+# References
diff --git a/paper/pymilo_outlook.png b/paper/pymilo_outlook.png