mirror of https://github.com/pjreddie/darknet.git
This commit is contained in:
parent
f6afaabcdf
commit
d38a64807f
12
LICENSE
12
LICENSE
|
@@ -1,12 +0,0 @@
|
|||
YOLO LICENSE
|
||||
Version 2, July 29 2016
|
||||
|
||||
THIS SOFTWARE LICENSE IS PROVIDED "ALL CAPS" SO THAT YOU KNOW IT IS SUPER
|
||||
SERIOUS AND YOU DON'T MESS AROUND WITH COPYRIGHT LAW BECAUSE YOU WILL GET IN
|
||||
TROUBLE HERE ARE SOME OTHER BUZZWORDS COMMONLY IN THESE THINGS WARRANTIES
|
||||
LIABILITY CONTRACT TORT LIABLE CLAIMS RESTRICTION MERCHANTABILITY. NOW HERE'S
|
||||
THE REAL LICENSE:
|
||||
|
||||
0. Darknet is public domain.
|
||||
1. Do whatever you want with it.
|
||||
2. Stop emailing me about it!
|
13
LICENSE.fuck
13
LICENSE.fuck
|
@@ -1,13 +0,0 @@
|
|||
DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
|
||||
Version 2, December 2004
|
||||
|
||||
Copyright (C) 2004 Sam Hocevar <sam@hocevar.net>
|
||||
|
||||
Everyone is permitted to copy and distribute verbatim or modified
|
||||
copies of this license document, and changing it is allowed as long
|
||||
as the name is changed.
|
||||
|
||||
DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
|
||||
TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
|
||||
|
||||
0. You just DO WHAT THE FUCK YOU WANT TO.
|
91
LICENSE.gen
91
LICENSE.gen
|
@@ -1,91 +0,0 @@
|
|||
RNN LICENSE Version 3, June 21 2017
|
||||
|
||||
Copyright (c) 1990, 1989, 1999 Free87337 May 48 THIRD PARTIES OR ANY OTHER THE
|
||||
COMPLAIN OR CONSEQUENTIAL DAMAGES AND REGARDLESS OF WHETHER IN CONTRACT, TO THE
|
||||
EXTENT REPAIR OR AGENTS (NOT THE IN ANY EVENT). THE SOFTWARE WILL BE
|
||||
UNINTERRUPTED OR ERROR-FREE OR ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
|
||||
STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
||||
OUT OF THE USE OF ALL THE WORK (GOVERNED CODE) HIM RESPONSES, OR OF FINES,
|
||||
SPECIAL, INCIDENTAL, CONSEQUENTIAL, PUNITIVE OR ANY OTHER OR OTHER HARL UNDER NO
|
||||
CIRCUMSTANCES AND UNDER NO LEGAL THEORY, WHETHER TORT (INCLUDING NEGLIGENCE),
|
||||
PATENT PERMITTED BY THE INSTAGRAM PARENT STATE OR TORT (INCLUDING NEGLIGENCE),
|
||||
PRODUCT LIABILITY OR OTHERWISE, ARISING OUT OF OR IN CONNECTION WITH THE
|
||||
SOFTWARE OR THE USE OR ANYTHING PROVIDED IN THIS PRODUCT, COMMIS AND SERVICES
|
||||
ARE LICENSED SOFTWARE AND ANY RESULE OR ANY OTHER THE COPYRIGHT HOLDERS BE
|
||||
LIABLE FOR ANY SPECIAL, INCIDENTAL, CASE, SUCH WARRANTIES, EXPRESS OR IMPLIED,
|
||||
INCLUDING, WITHOUT LIMITATION, WARRANTIES THAT THE COPYRIGHT HOLDERS AND/OR ANY
|
||||
PERSON FOR ANY INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES OF ANY
|
||||
EXPRESS OR DISTRIBUTE THAT ALL CLAIMS ARE SHALL CREATE DERAVE BE LIABLE TO YOU
|
||||
WILL HAVE BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
|
||||
|
||||
6. TERMINATION. TO THE EXTENT PERMITTED BY LAW, NO USE OF THE COVERED CODE IS
|
||||
WITH YOU. SHOULD ANY COVERED CODE PROVE DEFECTIVE IN ANY RESPECT, YOU (NOT THE
|
||||
INITIAL DEVELOPER OR ANY OTHER CONTRIBUTOR) ASSUME THE COST OF ANY NECESSARY
|
||||
SERVICING, REPAIR OR COULT OR IN ANY WAY OUT OF THE USE OF THE WEBSITES OR
|
||||
SERVICE WILL BE CONSEQUENTIAL DAMAGES OF ANY KIND HAS BEEN ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGES.
|
||||
|
||||
|
||||
This paragraph Agreement constitutes the entire agreement between the parties
|
||||
with respect to the Work licensed here. However, if you place the name of the
|
||||
fact that the arbitration was the consultation of the parties as a "patent is".
|
||||
Subject to the terms and conditions of this License, Contributor has knowledge
|
||||
that a license under a third party may also be used to endorse or promote
|
||||
products derived from the Work, and there is no warranty on the Software and
|
||||
Science Fees. For the purposes of this Agreement, attach the following
|
||||
disclaimers (without liabilities of written notice to the Subject Software) in a
|
||||
manner that a product is under common control with you. The Free Software
|
||||
Foundation may publish revised and/or new versions of the License for the
|
||||
Modifications made by the applicable terms. The Recipient shall promptly retain
|
||||
the covered works for any reason be entered in any federal or state or login
|
||||
Restricted Laws appearing in the United States or any of its own information
|
||||
that is not disabled from a derivative work except as expressly permitted in
|
||||
this License, to the extent that they are in receiving the Software and Source
|
||||
Code or any exercise of the rights granted to You by this License or a
|
||||
Contributor made by the Licensor or are authorized to make a reasonable
|
||||
retirement by the courts of the courts located in Santa Clara County, California
|
||||
printed and related to the Work or “Company” and Apache Software Foundation. If
|
||||
the Licensor shall be entitled to reflect your rights to use the Software and
|
||||
the Software to exercise the rights granted to the recipient without a
|
||||
requirement to exercise the rights granted by the Agreement to the provision
|
||||
will begin will appear in such cases, you will use such information without such
|
||||
corporation shall be an officer with respect to any part of the Software or any
|
||||
portion thereof. Capitalized terms are included in the Initial Contributor and
|
||||
under no circumstances will license the Service at any time and for any direct,
|
||||
indirect, special, incidental, or consequential damages of or assist in
|
||||
connection with any Services or the registration purposes only to the extent
|
||||
that it includes any or all means including the processing of which you download
|
||||
any derivative work. Any of the purchases’ transmission purposes are made
|
||||
available, if any, in other circumstances, we may review the copyright notice.
|
||||
In the event that this Agreement is required to give us strict content. The
|
||||
inclusion of the other party hereunder may also notify you Intellectual Property
|
||||
Rights to any third party. This means that the Source Code exists of the Work
|
||||
will not charge a program available to you at any time. You must include a
|
||||
prominent statement that the Software is governed under a particular version of
|
||||
this Agreement. You must include a provision to the extent that there is no
|
||||
warranty for the content of others. You agree that the Recipient was appointed
|
||||
as a Contributor, (c) are effective until terminated by hereunder, then the
|
||||
registration are not disabled and not limited to, submit any Customer Data
|
||||
without the updated use of the Software and that no fee is released. You grant
|
||||
to Use Other Arbitration Rules for Diagnostic or Services may use or modify the
|
||||
Apple Software and Consolidated Apple Software or Services. The Company may have
|
||||
full risk as a product of the Compatible Source. A Contribution by the Licensor
|
||||
or by the updated Software under the following conditions we can redistribute
|
||||
any General Provision of this Agreement. If the Program is used in accordance
|
||||
with the terms of this Agreement, Customer may provide advertisements from your
|
||||
devices that clause you can your employer or a transaction or country that has
|
||||
been controlled by the arbitrator, that they will be useful of this Agreement.
|
||||
The term "Open Source Software is available in connection with the program, and
|
||||
you may not protect the combination of the Covered Code. You should like to
|
||||
select a user's rights to charge a copy of this License. I are Contributor's
|
||||
confidentiality of the exercise of the rights granted herein. Such a covered
|
||||
work is released as a consequence, the Licensor shall be eligible for a purpose
|
||||
or subcontractor of the person or entity to the user of the user, then the word
|
||||
"Application" means having the original fee for any reason; and that no patent
|
||||
license to more than fifty stated close of the license term. The terms of this
|
||||
License will the license terms and conditions set forth in Section 2.2 (OPEC)
|
||||
and You will not use the Software or any set of responsibility for any resulting
|
||||
information that the Original Code warrants that you have the right to disclose
|
||||
these information (or in the notification; or (iii) late use of the software or
|
||||
any third party to the three (50) days before such belief to the extent that it
|
||||
includes a court court obtains the rights granted by this License.
|
674
LICENSE.gpl
674
LICENSE.gpl
|
@@ -1,674 +0,0 @@
|
|||
GNU GENERAL PUBLIC LICENSE
|
||||
Version 3, 29 June 2007
|
||||
|
||||
Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
|
||||
Everyone is permitted to copy and distribute verbatim copies
|
||||
of this license document, but changing it is not allowed.
|
||||
|
||||
Preamble
|
||||
|
||||
The GNU General Public License is a free, copyleft license for
|
||||
software and other kinds of works.
|
||||
|
||||
The licenses for most software and other practical works are designed
|
||||
to take away your freedom to share and change the works. By contrast,
|
||||
the GNU General Public License is intended to guarantee your freedom to
|
||||
share and change all versions of a program--to make sure it remains free
|
||||
software for all its users. We, the Free Software Foundation, use the
|
||||
GNU General Public License for most of our software; it applies also to
|
||||
any other work released this way by its authors. You can apply it to
|
||||
your programs, too.
|
||||
|
||||
When we speak of free software, we are referring to freedom, not
|
||||
price. Our General Public Licenses are designed to make sure that you
|
||||
have the freedom to distribute copies of free software (and charge for
|
||||
them if you wish), that you receive source code or can get it if you
|
||||
want it, that you can change the software or use pieces of it in new
|
||||
free programs, and that you know you can do these things.
|
||||
|
||||
To protect your rights, we need to prevent others from denying you
|
||||
these rights or asking you to surrender the rights. Therefore, you have
|
||||
certain responsibilities if you distribute copies of the software, or if
|
||||
you modify it: responsibilities to respect the freedom of others.
|
||||
|
||||
For example, if you distribute copies of such a program, whether
|
||||
gratis or for a fee, you must pass on to the recipients the same
|
||||
freedoms that you received. You must make sure that they, too, receive
|
||||
or can get the source code. And you must show them these terms so they
|
||||
know their rights.
|
||||
|
||||
Developers that use the GNU GPL protect your rights with two steps:
|
||||
(1) assert copyright on the software, and (2) offer you this License
|
||||
giving you legal permission to copy, distribute and/or modify it.
|
||||
|
||||
For the developers' and authors' protection, the GPL clearly explains
|
||||
that there is no warranty for this free software. For both users' and
|
||||
authors' sake, the GPL requires that modified versions be marked as
|
||||
changed, so that their problems will not be attributed erroneously to
|
||||
authors of previous versions.
|
||||
|
||||
Some devices are designed to deny users access to install or run
|
||||
modified versions of the software inside them, although the manufacturer
|
||||
can do so. This is fundamentally incompatible with the aim of
|
||||
protecting users' freedom to change the software. The systematic
|
||||
pattern of such abuse occurs in the area of products for individuals to
|
||||
use, which is precisely where it is most unacceptable. Therefore, we
|
||||
have designed this version of the GPL to prohibit the practice for those
|
||||
products. If such problems arise substantially in other domains, we
|
||||
stand ready to extend this provision to those domains in future versions
|
||||
of the GPL, as needed to protect the freedom of users.
|
||||
|
||||
Finally, every program is threatened constantly by software patents.
|
||||
States should not allow patents to restrict development and use of
|
||||
software on general-purpose computers, but in those that do, we wish to
|
||||
avoid the special danger that patents applied to a free program could
|
||||
make it effectively proprietary. To prevent this, the GPL assures that
|
||||
patents cannot be used to render the program non-free.
|
||||
|
||||
The precise terms and conditions for copying, distribution and
|
||||
modification follow.
|
||||
|
||||
TERMS AND CONDITIONS
|
||||
|
||||
0. Definitions.
|
||||
|
||||
"This License" refers to version 3 of the GNU General Public License.
|
||||
|
||||
"Copyright" also means copyright-like laws that apply to other kinds of
|
||||
works, such as semiconductor masks.
|
||||
|
||||
"The Program" refers to any copyrightable work licensed under this
|
||||
License. Each licensee is addressed as "you". "Licensees" and
|
||||
"recipients" may be individuals or organizations.
|
||||
|
||||
To "modify" a work means to copy from or adapt all or part of the work
|
||||
in a fashion requiring copyright permission, other than the making of an
|
||||
exact copy. The resulting work is called a "modified version" of the
|
||||
earlier work or a work "based on" the earlier work.
|
||||
|
||||
A "covered work" means either the unmodified Program or a work based
|
||||
on the Program.
|
||||
|
||||
To "propagate" a work means to do anything with it that, without
|
||||
permission, would make you directly or secondarily liable for
|
||||
infringement under applicable copyright law, except executing it on a
|
||||
computer or modifying a private copy. Propagation includes copying,
|
||||
distribution (with or without modification), making available to the
|
||||
public, and in some countries other activities as well.
|
||||
|
||||
To "convey" a work means any kind of propagation that enables other
|
||||
parties to make or receive copies. Mere interaction with a user through
|
||||
a computer network, with no transfer of a copy, is not conveying.
|
||||
|
||||
An interactive user interface displays "Appropriate Legal Notices"
|
||||
to the extent that it includes a convenient and prominently visible
|
||||
feature that (1) displays an appropriate copyright notice, and (2)
|
||||
tells the user that there is no warranty for the work (except to the
|
||||
extent that warranties are provided), that licensees may convey the
|
||||
work under this License, and how to view a copy of this License. If
|
||||
the interface presents a list of user commands or options, such as a
|
||||
menu, a prominent item in the list meets this criterion.
|
||||
|
||||
1. Source Code.
|
||||
|
||||
The "source code" for a work means the preferred form of the work
|
||||
for making modifications to it. "Object code" means any non-source
|
||||
form of a work.
|
||||
|
||||
A "Standard Interface" means an interface that either is an official
|
||||
standard defined by a recognized standards body, or, in the case of
|
||||
interfaces specified for a particular programming language, one that
|
||||
is widely used among developers working in that language.
|
||||
|
||||
The "System Libraries" of an executable work include anything, other
|
||||
than the work as a whole, that (a) is included in the normal form of
|
||||
packaging a Major Component, but which is not part of that Major
|
||||
Component, and (b) serves only to enable use of the work with that
|
||||
Major Component, or to implement a Standard Interface for which an
|
||||
implementation is available to the public in source code form. A
|
||||
"Major Component", in this context, means a major essential component
|
||||
(kernel, window system, and so on) of the specific operating system
|
||||
(if any) on which the executable work runs, or a compiler used to
|
||||
produce the work, or an object code interpreter used to run it.
|
||||
|
||||
The "Corresponding Source" for a work in object code form means all
|
||||
the source code needed to generate, install, and (for an executable
|
||||
work) run the object code and to modify the work, including scripts to
|
||||
control those activities. However, it does not include the work's
|
||||
System Libraries, or general-purpose tools or generally available free
|
||||
programs which are used unmodified in performing those activities but
|
||||
which are not part of the work. For example, Corresponding Source
|
||||
includes interface definition files associated with source files for
|
||||
the work, and the source code for shared libraries and dynamically
|
||||
linked subprograms that the work is specifically designed to require,
|
||||
such as by intimate data communication or control flow between those
|
||||
subprograms and other parts of the work.
|
||||
|
||||
The Corresponding Source need not include anything that users
|
||||
can regenerate automatically from other parts of the Corresponding
|
||||
Source.
|
||||
|
||||
The Corresponding Source for a work in source code form is that
|
||||
same work.
|
||||
|
||||
2. Basic Permissions.
|
||||
|
||||
All rights granted under this License are granted for the term of
|
||||
copyright on the Program, and are irrevocable provided the stated
|
||||
conditions are met. This License explicitly affirms your unlimited
|
||||
permission to run the unmodified Program. The output from running a
|
||||
covered work is covered by this License only if the output, given its
|
||||
content, constitutes a covered work. This License acknowledges your
|
||||
rights of fair use or other equivalent, as provided by copyright law.
|
||||
|
||||
You may make, run and propagate covered works that you do not
|
||||
convey, without conditions so long as your license otherwise remains
|
||||
in force. You may convey covered works to others for the sole purpose
|
||||
of having them make modifications exclusively for you, or provide you
|
||||
with facilities for running those works, provided that you comply with
|
||||
the terms of this License in conveying all material for which you do
|
||||
not control copyright. Those thus making or running the covered works
|
||||
for you must do so exclusively on your behalf, under your direction
|
||||
and control, on terms that prohibit them from making any copies of
|
||||
your copyrighted material outside their relationship with you.
|
||||
|
||||
Conveying under any other circumstances is permitted solely under
|
||||
the conditions stated below. Sublicensing is not allowed; section 10
|
||||
makes it unnecessary.
|
||||
|
||||
3. Protecting Users' Legal Rights From Anti-Circumvention Law.
|
||||
|
||||
No covered work shall be deemed part of an effective technological
|
||||
measure under any applicable law fulfilling obligations under article
|
||||
11 of the WIPO copyright treaty adopted on 20 December 1996, or
|
||||
similar laws prohibiting or restricting circumvention of such
|
||||
measures.
|
||||
|
||||
When you convey a covered work, you waive any legal power to forbid
|
||||
circumvention of technological measures to the extent such circumvention
|
||||
is effected by exercising rights under this License with respect to
|
||||
the covered work, and you disclaim any intention to limit operation or
|
||||
modification of the work as a means of enforcing, against the work's
|
||||
users, your or third parties' legal rights to forbid circumvention of
|
||||
technological measures.
|
||||
|
||||
4. Conveying Verbatim Copies.
|
||||
|
||||
You may convey verbatim copies of the Program's source code as you
|
||||
receive it, in any medium, provided that you conspicuously and
|
||||
appropriately publish on each copy an appropriate copyright notice;
|
||||
keep intact all notices stating that this License and any
|
||||
non-permissive terms added in accord with section 7 apply to the code;
|
||||
keep intact all notices of the absence of any warranty; and give all
|
||||
recipients a copy of this License along with the Program.
|
||||
|
||||
You may charge any price or no price for each copy that you convey,
|
||||
and you may offer support or warranty protection for a fee.
|
||||
|
||||
5. Conveying Modified Source Versions.
|
||||
|
||||
You may convey a work based on the Program, or the modifications to
|
||||
produce it from the Program, in the form of source code under the
|
||||
terms of section 4, provided that you also meet all of these conditions:
|
||||
|
||||
a) The work must carry prominent notices stating that you modified
|
||||
it, and giving a relevant date.
|
||||
|
||||
b) The work must carry prominent notices stating that it is
|
||||
released under this License and any conditions added under section
|
||||
7. This requirement modifies the requirement in section 4 to
|
||||
"keep intact all notices".
|
||||
|
||||
c) You must license the entire work, as a whole, under this
|
||||
License to anyone who comes into possession of a copy. This
|
||||
License will therefore apply, along with any applicable section 7
|
||||
additional terms, to the whole of the work, and all its parts,
|
||||
regardless of how they are packaged. This License gives no
|
||||
permission to license the work in any other way, but it does not
|
||||
invalidate such permission if you have separately received it.
|
||||
|
||||
d) If the work has interactive user interfaces, each must display
|
||||
Appropriate Legal Notices; however, if the Program has interactive
|
||||
interfaces that do not display Appropriate Legal Notices, your
|
||||
work need not make them do so.
|
||||
|
||||
A compilation of a covered work with other separate and independent
|
||||
works, which are not by their nature extensions of the covered work,
|
||||
and which are not combined with it such as to form a larger program,
|
||||
in or on a volume of a storage or distribution medium, is called an
|
||||
"aggregate" if the compilation and its resulting copyright are not
|
||||
used to limit the access or legal rights of the compilation's users
|
||||
beyond what the individual works permit. Inclusion of a covered work
|
||||
in an aggregate does not cause this License to apply to the other
|
||||
parts of the aggregate.
|
||||
|
||||
6. Conveying Non-Source Forms.
|
||||
|
||||
You may convey a covered work in object code form under the terms
|
||||
of sections 4 and 5, provided that you also convey the
|
||||
machine-readable Corresponding Source under the terms of this License,
|
||||
in one of these ways:
|
||||
|
||||
a) Convey the object code in, or embodied in, a physical product
|
||||
(including a physical distribution medium), accompanied by the
|
||||
Corresponding Source fixed on a durable physical medium
|
||||
customarily used for software interchange.
|
||||
|
||||
b) Convey the object code in, or embodied in, a physical product
|
||||
(including a physical distribution medium), accompanied by a
|
||||
written offer, valid for at least three years and valid for as
|
||||
long as you offer spare parts or customer support for that product
|
||||
model, to give anyone who possesses the object code either (1) a
|
||||
copy of the Corresponding Source for all the software in the
|
||||
product that is covered by this License, on a durable physical
|
||||
medium customarily used for software interchange, for a price no
|
||||
more than your reasonable cost of physically performing this
|
||||
conveying of source, or (2) access to copy the
|
||||
Corresponding Source from a network server at no charge.
|
||||
|
||||
c) Convey individual copies of the object code with a copy of the
|
||||
written offer to provide the Corresponding Source. This
|
||||
alternative is allowed only occasionally and noncommercially, and
|
||||
only if you received the object code with such an offer, in accord
|
||||
with subsection 6b.
|
||||
|
||||
d) Convey the object code by offering access from a designated
|
||||
place (gratis or for a charge), and offer equivalent access to the
|
||||
Corresponding Source in the same way through the same place at no
|
||||
further charge. You need not require recipients to copy the
|
||||
Corresponding Source along with the object code. If the place to
|
||||
copy the object code is a network server, the Corresponding Source
|
||||
may be on a different server (operated by you or a third party)
|
||||
that supports equivalent copying facilities, provided you maintain
|
||||
clear directions next to the object code saying where to find the
|
||||
Corresponding Source. Regardless of what server hosts the
|
||||
Corresponding Source, you remain obligated to ensure that it is
|
||||
available for as long as needed to satisfy these requirements.
|
||||
|
||||
e) Convey the object code using peer-to-peer transmission, provided
|
||||
you inform other peers where the object code and Corresponding
|
||||
Source of the work are being offered to the general public at no
|
||||
charge under subsection 6d.
|
||||
|
||||
A separable portion of the object code, whose source code is excluded
|
||||
from the Corresponding Source as a System Library, need not be
|
||||
included in conveying the object code work.
|
||||
|
||||
A "User Product" is either (1) a "consumer product", which means any
|
||||
tangible personal property which is normally used for personal, family,
|
||||
or household purposes, or (2) anything designed or sold for incorporation
|
||||
into a dwelling. In determining whether a product is a consumer product,
|
||||
doubtful cases shall be resolved in favor of coverage. For a particular
|
||||
product received by a particular user, "normally used" refers to a
|
||||
typical or common use of that class of product, regardless of the status
|
||||
of the particular user or of the way in which the particular user
|
||||
actually uses, or expects or is expected to use, the product. A product
|
||||
is a consumer product regardless of whether the product has substantial
|
||||
commercial, industrial or non-consumer uses, unless such uses represent
|
||||
the only significant mode of use of the product.
|
||||
|
||||
"Installation Information" for a User Product means any methods,
|
||||
procedures, authorization keys, or other information required to install
|
||||
and execute modified versions of a covered work in that User Product from
|
||||
a modified version of its Corresponding Source. The information must
|
||||
suffice to ensure that the continued functioning of the modified object
|
||||
code is in no case prevented or interfered with solely because
|
||||
modification has been made.
|
||||
|
||||
If you convey an object code work under this section in, or with, or
|
||||
specifically for use in, a User Product, and the conveying occurs as
|
||||
part of a transaction in which the right of possession and use of the
|
||||
User Product is transferred to the recipient in perpetuity or for a
|
||||
fixed term (regardless of how the transaction is characterized), the
|
||||
Corresponding Source conveyed under this section must be accompanied
|
||||
by the Installation Information. But this requirement does not apply
|
||||
if neither you nor any third party retains the ability to install
|
||||
modified object code on the User Product (for example, the work has
|
||||
been installed in ROM).
|
||||
|
||||
The requirement to provide Installation Information does not include a
|
||||
requirement to continue to provide support service, warranty, or updates
|
||||
for a work that has been modified or installed by the recipient, or for
|
||||
the User Product in which it has been modified or installed. Access to a
|
||||
network may be denied when the modification itself materially and
|
||||
adversely affects the operation of the network or violates the rules and
|
||||
protocols for communication across the network.
|
||||
|
||||
Corresponding Source conveyed, and Installation Information provided,
|
||||
in accord with this section must be in a format that is publicly
|
||||
documented (and with an implementation available to the public in
|
||||
source code form), and must require no special password or key for
|
||||
unpacking, reading or copying.
|
||||
|
||||
7. Additional Terms.
|
||||
|
||||
"Additional permissions" are terms that supplement the terms of this
|
||||
License by making exceptions from one or more of its conditions.
|
||||
Additional permissions that are applicable to the entire Program shall
|
||||
be treated as though they were included in this License, to the extent
|
||||
that they are valid under applicable law. If additional permissions
|
||||
apply only to part of the Program, that part may be used separately
|
||||
under those permissions, but the entire Program remains governed by
|
||||
this License without regard to the additional permissions.
|
||||
|
||||
When you convey a copy of a covered work, you may at your option
|
||||
remove any additional permissions from that copy, or from any part of
|
||||
it. (Additional permissions may be written to require their own
|
||||
removal in certain cases when you modify the work.) You may place
|
||||
additional permissions on material, added by you to a covered work,
|
||||
for which you have or can give appropriate copyright permission.
|
||||
|
||||
Notwithstanding any other provision of this License, for material you
|
||||
add to a covered work, you may (if authorized by the copyright holders of
|
||||
that material) supplement the terms of this License with terms:
|
||||
|
||||
a) Disclaiming warranty or limiting liability differently from the
|
||||
terms of sections 15 and 16 of this License; or
|
||||
|
||||
b) Requiring preservation of specified reasonable legal notices or
|
||||
author attributions in that material or in the Appropriate Legal
|
||||
Notices displayed by works containing it; or
|
||||
|
||||
c) Prohibiting misrepresentation of the origin of that material, or
|
||||
requiring that modified versions of such material be marked in
|
||||
reasonable ways as different from the original version; or
|
||||
|
||||
d) Limiting the use for publicity purposes of names of licensors or
|
||||
authors of the material; or
|
||||
|
||||
e) Declining to grant rights under trademark law for use of some
|
||||
trade names, trademarks, or service marks; or
|
||||
|
||||
f) Requiring indemnification of licensors and authors of that
|
||||
material by anyone who conveys the material (or modified versions of
|
||||
it) with contractual assumptions of liability to the recipient, for
|
||||
any liability that these contractual assumptions directly impose on
|
||||
those licensors and authors.
|
||||
|
||||
All other non-permissive additional terms are considered "further
|
||||
restrictions" within the meaning of section 10. If the Program as you
|
||||
received it, or any part of it, contains a notice stating that it is
|
||||
governed by this License along with a term that is a further
|
||||
restriction, you may remove that term. If a license document contains
|
||||
a further restriction but permits relicensing or conveying under this
|
||||
License, you may add to a covered work material governed by the terms
|
||||
of that license document, provided that the further restriction does
|
||||
not survive such relicensing or conveying.
|
||||
|
||||
If you add terms to a covered work in accord with this section, you
|
||||
must place, in the relevant source files, a statement of the
|
||||
additional terms that apply to those files, or a notice indicating
|
||||
where to find the applicable terms.
|
||||
|
||||
Additional terms, permissive or non-permissive, may be stated in the
|
||||
form of a separately written license, or stated as exceptions;
|
||||
the above requirements apply either way.
|
||||
|
||||
8. Termination.
|
||||
|
||||
You may not propagate or modify a covered work except as expressly
|
||||
provided under this License. Any attempt otherwise to propagate or
|
||||
modify it is void, and will automatically terminate your rights under
|
||||
this License (including any patent licenses granted under the third
|
||||
paragraph of section 11).
|
||||
|
||||
However, if you cease all violation of this License, then your
|
||||
license from a particular copyright holder is reinstated (a)
|
||||
provisionally, unless and until the copyright holder explicitly and
|
||||
finally terminates your license, and (b) permanently, if the copyright
|
||||
holder fails to notify you of the violation by some reasonable means
|
||||
prior to 60 days after the cessation.
|
||||
|
||||
Moreover, your license from a particular copyright holder is
|
||||
reinstated permanently if the copyright holder notifies you of the
|
||||
violation by some reasonable means, this is the first time you have
|
||||
received notice of violation of this License (for any work) from that
|
||||
copyright holder, and you cure the violation prior to 30 days after
|
||||
your receipt of the notice.
|
||||
|
||||
Termination of your rights under this section does not terminate the
|
||||
licenses of parties who have received copies or rights from you under
|
||||
this License. If your rights have been terminated and not permanently
|
||||
reinstated, you do not qualify to receive new licenses for the same
|
||||
material under section 10.
|
||||
|
||||
9. Acceptance Not Required for Having Copies.
|
||||
|
||||
You are not required to accept this License in order to receive or
|
||||
run a copy of the Program. Ancillary propagation of a covered work
|
||||
occurring solely as a consequence of using peer-to-peer transmission
|
||||
to receive a copy likewise does not require acceptance. However,
|
||||
nothing other than this License grants you permission to propagate or
|
||||
modify any covered work. These actions infringe copyright if you do
|
||||
not accept this License. Therefore, by modifying or propagating a
|
||||
covered work, you indicate your acceptance of this License to do so.
|
||||
|
||||
10. Automatic Licensing of Downstream Recipients.
|
||||
|
||||
Each time you convey a covered work, the recipient automatically
|
||||
receives a license from the original licensors, to run, modify and
|
||||
propagate that work, subject to this License. You are not responsible
|
||||
for enforcing compliance by third parties with this License.
|
||||
|
||||
An "entity transaction" is a transaction transferring control of an
|
||||
organization, or substantially all assets of one, or subdividing an
|
||||
organization, or merging organizations. If propagation of a covered
|
||||
work results from an entity transaction, each party to that
|
||||
transaction who receives a copy of the work also receives whatever
|
||||
licenses to the work the party's predecessor in interest had or could
|
||||
give under the previous paragraph, plus a right to possession of the
|
||||
Corresponding Source of the work from the predecessor in interest, if
|
||||
the predecessor has it or can get it with reasonable efforts.
|
||||
|
||||
You may not impose any further restrictions on the exercise of the
|
||||
rights granted or affirmed under this License. For example, you may
|
||||
not impose a license fee, royalty, or other charge for exercise of
|
||||
rights granted under this License, and you may not initiate litigation
|
||||
(including a cross-claim or counterclaim in a lawsuit) alleging that
|
||||
any patent claim is infringed by making, using, selling, offering for
|
||||
sale, or importing the Program or any portion of it.
|
||||
|
||||
11. Patents.
|
||||
|
||||
A "contributor" is a copyright holder who authorizes use under this
|
||||
License of the Program or a work on which the Program is based. The
|
||||
work thus licensed is called the contributor's "contributor version".
|
||||
|
||||
A contributor's "essential patent claims" are all patent claims
|
||||
owned or controlled by the contributor, whether already acquired or
|
||||
hereafter acquired, that would be infringed by some manner, permitted
|
||||
by this License, of making, using, or selling its contributor version,
|
||||
but do not include claims that would be infringed only as a
|
||||
consequence of further modification of the contributor version. For
|
||||
purposes of this definition, "control" includes the right to grant
|
||||
patent sublicenses in a manner consistent with the requirements of
|
||||
this License.
|
||||
|
||||
Each contributor grants you a non-exclusive, worldwide, royalty-free
|
||||
patent license under the contributor's essential patent claims, to
|
||||
make, use, sell, offer for sale, import and otherwise run, modify and
|
||||
propagate the contents of its contributor version.
|
||||
|
||||
In the following three paragraphs, a "patent license" is any express
|
||||
agreement or commitment, however denominated, not to enforce a patent
|
||||
(such as an express permission to practice a patent or covenant not to
|
||||
sue for patent infringement). To "grant" such a patent license to a
|
||||
party means to make such an agreement or commitment not to enforce a
|
||||
patent against the party.
|
||||
|
||||
If you convey a covered work, knowingly relying on a patent license,
|
||||
and the Corresponding Source of the work is not available for anyone
|
||||
to copy, free of charge and under the terms of this License, through a
|
||||
publicly available network server or other readily accessible means,
|
||||
then you must either (1) cause the Corresponding Source to be so
|
||||
available, or (2) arrange to deprive yourself of the benefit of the
|
||||
patent license for this particular work, or (3) arrange, in a manner
|
||||
consistent with the requirements of this License, to extend the patent
|
||||
license to downstream recipients. "Knowingly relying" means you have
|
||||
actual knowledge that, but for the patent license, your conveying the
|
||||
covered work in a country, or your recipient's use of the covered work
|
||||
in a country, would infringe one or more identifiable patents in that
|
||||
country that you have reason to believe are valid.
|
||||
|
||||
If, pursuant to or in connection with a single transaction or
|
||||
arrangement, you convey, or propagate by procuring conveyance of, a
|
||||
covered work, and grant a patent license to some of the parties
|
||||
receiving the covered work authorizing them to use, propagate, modify
|
||||
or convey a specific copy of the covered work, then the patent license
|
||||
you grant is automatically extended to all recipients of the covered
|
||||
work and works based on it.
|
||||
|
||||
A patent license is "discriminatory" if it does not include within
|
||||
the scope of its coverage, prohibits the exercise of, or is
|
||||
conditioned on the non-exercise of one or more of the rights that are
|
||||
specifically granted under this License. You may not convey a covered
|
||||
work if you are a party to an arrangement with a third party that is
|
||||
in the business of distributing software, under which you make payment
|
||||
to the third party based on the extent of your activity of conveying
|
||||
the work, and under which the third party grants, to any of the
|
||||
parties who would receive the covered work from you, a discriminatory
|
||||
patent license (a) in connection with copies of the covered work
|
||||
conveyed by you (or copies made from those copies), or (b) primarily
|
||||
for and in connection with specific products or compilations that
|
||||
contain the covered work, unless you entered into that arrangement,
|
||||
or that patent license was granted, prior to 28 March 2007.
|
||||
|
||||
Nothing in this License shall be construed as excluding or limiting
|
||||
any implied license or other defenses to infringement that may
|
||||
otherwise be available to you under applicable patent law.
|
||||
|
||||
12. No Surrender of Others' Freedom.
|
||||
|
||||
If conditions are imposed on you (whether by court order, agreement or
|
||||
otherwise) that contradict the conditions of this License, they do not
|
||||
excuse you from the conditions of this License. If you cannot convey a
|
||||
covered work so as to satisfy simultaneously your obligations under this
|
||||
License and any other pertinent obligations, then as a consequence you may
|
||||
not convey it at all. For example, if you agree to terms that obligate you
|
||||
to collect a royalty for further conveying from those to whom you convey
|
||||
the Program, the only way you could satisfy both those terms and this
|
||||
License would be to refrain entirely from conveying the Program.
|
||||
|
||||
13. Use with the GNU Affero General Public License.
|
||||
|
||||
Notwithstanding any other provision of this License, you have
|
||||
permission to link or combine any covered work with a work licensed
|
||||
under version 3 of the GNU Affero General Public License into a single
|
||||
combined work, and to convey the resulting work. The terms of this
|
||||
License will continue to apply to the part which is the covered work,
|
||||
but the special requirements of the GNU Affero General Public License,
|
||||
section 13, concerning interaction through a network will apply to the
|
||||
combination as such.
|
||||
|
||||
14. Revised Versions of this License.
|
||||
|
||||
The Free Software Foundation may publish revised and/or new versions of
|
||||
the GNU General Public License from time to time. Such new versions will
|
||||
be similar in spirit to the present version, but may differ in detail to
|
||||
address new problems or concerns.
|
||||
|
||||
Each version is given a distinguishing version number. If the
|
||||
Program specifies that a certain numbered version of the GNU General
|
||||
Public License "or any later version" applies to it, you have the
|
||||
option of following the terms and conditions either of that numbered
|
||||
version or of any later version published by the Free Software
|
||||
Foundation. If the Program does not specify a version number of the
|
||||
GNU General Public License, you may choose any version ever published
|
||||
by the Free Software Foundation.
|
||||
|
||||
If the Program specifies that a proxy can decide which future
|
||||
versions of the GNU General Public License can be used, that proxy's
|
||||
public statement of acceptance of a version permanently authorizes you
|
||||
to choose that version for the Program.
|
||||
|
||||
Later license versions may give you additional or different
|
||||
permissions. However, no additional obligations are imposed on any
|
||||
author or copyright holder as a result of your choosing to follow a
|
||||
later version.
|
||||
|
||||
15. Disclaimer of Warranty.
|
||||
|
||||
THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
|
||||
APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
|
||||
HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
|
||||
OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
|
||||
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
|
||||
IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
|
||||
ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
|
||||
|
||||
16. Limitation of Liability.
|
||||
|
||||
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
|
||||
WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
|
||||
THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
|
||||
GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
|
||||
USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
|
||||
DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
|
||||
PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
|
||||
EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
|
||||
SUCH DAMAGES.
|
||||
|
||||
17. Interpretation of Sections 15 and 16.
|
||||
|
||||
If the disclaimer of warranty and limitation of liability provided
|
||||
above cannot be given local legal effect according to their terms,
|
||||
reviewing courts shall apply local law that most closely approximates
|
||||
an absolute waiver of all civil liability in connection with the
|
||||
Program, unless a warranty or assumption of liability accompanies a
|
||||
copy of the Program in return for a fee.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
How to Apply These Terms to Your New Programs
|
||||
|
||||
If you develop a new program, and you want it to be of the greatest
|
||||
possible use to the public, the best way to achieve this is to make it
|
||||
free software which everyone can redistribute and change under these terms.
|
||||
|
||||
To do so, attach the following notices to the program. It is safest
|
||||
to attach them to the start of each source file to most effectively
|
||||
state the exclusion of warranty; and each file should have at least
|
||||
the "copyright" line and a pointer to where the full notice is found.
|
||||
|
||||
{one line to give the program's name and a brief idea of what it does.}
|
||||
Copyright (C) {year} {name of author}
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
Also add information on how to contact you by electronic and paper mail.
|
||||
|
||||
If the program does terminal interaction, make it output a short
|
||||
notice like this when it starts in an interactive mode:
|
||||
|
||||
{project} Copyright (C) {year} {fullname}
|
||||
This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
|
||||
This is free software, and you are welcome to redistribute it
|
||||
under certain conditions; type `show c' for details.
|
||||
|
||||
The hypothetical commands `show w' and `show c' should show the appropriate
|
||||
parts of the General Public License. Of course, your program's commands
|
||||
might be different; for a GUI interface, you would use an "about box".
|
||||
|
||||
You should also get your employer (if you work as a programmer) or school,
|
||||
if any, to sign a "copyright disclaimer" for the program, if necessary.
|
||||
For more information on this, and how to apply and follow the GNU GPL, see
|
||||
<http://www.gnu.org/licenses/>.
|
||||
|
||||
The GNU General Public License does not permit incorporating your program
|
||||
into proprietary programs. If your program is a subroutine library, you
|
||||
may consider it more useful to permit linking proprietary applications with
|
||||
the library. If this is what you want to do, use the GNU Lesser General
|
||||
Public License instead of this License. But first, please read
|
||||
<http://www.gnu.org/philosophy/why-not-lgpl.html>.
|
|
@ -1,8 +0,0 @@
|
|||
META-LICENSE
|
||||
Version 1, June 21 2017
|
||||
|
||||
Any and all licenses may be applied to the software either individually
|
||||
or in concert. Any issues, ambiguities, paradoxes, or metaphysical quandries
|
||||
arising from this combination should be discussed with a local faith leader,
|
||||
hermit, or guru. The Oxford comma shall be used.
|
||||
|
22
LICENSE.mit
22
LICENSE.mit
|
@ -1,22 +0,0 @@
|
|||
MIT License
|
||||
|
||||
Copyright (c) 2017 Joseph Redmon
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
|
13
LICENSE.v1
13
LICENSE.v1
|
@ -1,13 +0,0 @@
|
|||
YOLO LICENSE
|
||||
Version 1, July 10 2015
|
||||
|
||||
THIS SOFTWARE LICENSE IS PROVIDED "ALL CAPS" SO THAT YOU KNOW IT IS SUPER
|
||||
SERIOUS AND YOU DON'T MESS AROUND WITH COPYRIGHT LAW BECAUSE YOU WILL GET IN
|
||||
TROUBLE HERE ARE SOME OTHER BUZZWORDS COMMONLY IN THESE THINGS WARRANTIES
|
||||
LIABILITY CONTRACT TORT LIABLE CLAIMS RESTRICTION MERCHANTABILITY SUBJECT TO
|
||||
THE FOLLOWING CONDITIONS:
|
||||
|
||||
1. #yolo
|
||||
2. #swag
|
||||
3. #blazeit
|
||||
|
73
Makefile
73
Makefile
|
@ -1,17 +1,9 @@
|
|||
GPU=0
|
||||
GPU=1
|
||||
CUDNN=0
|
||||
OPENCV=0
|
||||
OPENMP=0
|
||||
DEBUG=0
|
||||
|
||||
ARCH= -gencode arch=compute_30,code=sm_30 \
|
||||
-gencode arch=compute_35,code=sm_35 \
|
||||
-gencode arch=compute_50,code=[sm_50,compute_50] \
|
||||
-gencode arch=compute_52,code=[sm_52,compute_52]
|
||||
# -gencode arch=compute_20,code=[sm_20,sm_21] \ This one is deprecated?
|
||||
|
||||
# This is what I use, uncomment if you know your arch and want to specify
|
||||
# ARCH= -gencode arch=compute_52,code=compute_52
|
||||
|
||||
VPATH=./src/:./examples
|
||||
SLIB=libdarknet.so
|
||||
|
@ -19,49 +11,69 @@ ALIB=libdarknet.a
|
|||
EXEC=darknet
|
||||
OBJDIR=./obj/
|
||||
|
||||
CC=gcc
|
||||
CPP=g++
|
||||
NVCC=nvcc
|
||||
# 设置编译参数
|
||||
AR=ar
|
||||
ARFLAGS=rcs
|
||||
OPTS=-Ofast
|
||||
LDFLAGS= -lm -pthread
|
||||
LDFLAGS= -lm -pthread
|
||||
COMMON= -Iinclude/ -Isrc/
|
||||
CFLAGS=-Wall -Wno-unused-result -Wno-unknown-pragmas -Wfatal-errors -fPIC
|
||||
CFLAGS= -Wall -Wno-unused-result -Wno-unknown-pragmas -Wfatal-errors -Wno-write-strings -fPIC
|
||||
|
||||
ifeq ($(OPENMP), 1)
|
||||
CC=gcc
|
||||
CPP=g++
|
||||
ifeq ($(GPU), 1)
|
||||
HIP_ROOT_PATH=/opt/dtk-22.04.2
|
||||
|
||||
CC=${HIP_ROOT_PATH}/bin/hipcc
|
||||
CPP=${HIP_ROOT_PATH}/bin/hipcc
|
||||
NVCC=${HIP_ROOT_PATH}/bin/hipcc
|
||||
COMMON+= -DGPU -I${HIP_ROOT_PATH}/include/ -I${HIP_ROOT_PATH}/rocrand/include/ -I${HIP_ROOT_PATH}/hiprand/include/ -I${HIP_ROOT_PATH}/hipblas/include/
|
||||
CFLAGS+= -DGPU -D__HIP_PLATFORM_HCC__
|
||||
LDFLAGS+= -L${HIP_ROOT_PATH}/lib64 -lhipblas -lhiprand
|
||||
endif
|
||||
|
||||
|
||||
ifeq ($(OPENMP), 1)
|
||||
CFLAGS+= -fopenmp
|
||||
endif
|
||||
|
||||
ifeq ($(DEBUG), 1)
|
||||
ifeq ($(DEBUG), 1)
|
||||
OPTS=-O0 -g
|
||||
endif
|
||||
|
||||
CFLAGS+=$(OPTS)
|
||||
|
||||
ifeq ($(OPENCV), 1)
|
||||
ifeq ($(OPENCV), 1)
|
||||
COMMON+= -DOPENCV
|
||||
CFLAGS+= -DOPENCV
|
||||
LDFLAGS+= `pkg-config --libs opencv` -lstdc++
|
||||
COMMON+= `pkg-config --cflags opencv`
|
||||
COMMON+= `pkg-config --cflags opencv`
|
||||
endif
|
||||
|
||||
ifeq ($(GPU), 1)
|
||||
COMMON+= -DGPU -I/usr/local/cuda/include/
|
||||
CFLAGS+= -DGPU
|
||||
LDFLAGS+= -L/usr/local/cuda/lib64 -lcuda -lcudart -lcublas -lcurand
|
||||
endif
|
||||
#ifeq ($(GPU), 1)
|
||||
#COMMON+= -DGPU -I/usr/local/cuda/include/
|
||||
#CFLAGS+= -DGPU
|
||||
#LDFLAGS+= -L/usr/local/cuda/lib64 -lcuda -lcudart -lcublas -lcurand
|
||||
#endif
|
||||
|
||||
ifeq ($(CUDNN), 1)
|
||||
COMMON+= -DCUDNN
|
||||
ifeq ($(CUDNN), 1)
|
||||
COMMON+= -DCUDNN
|
||||
CFLAGS+= -DCUDNN
|
||||
LDFLAGS+= -lcudnn
|
||||
endif
|
||||
|
||||
OBJ=gemm.o utils.o cuda.o deconvolutional_layer.o convolutional_layer.o list.o image.o activations.o im2col.o col2im.o blas.o crop_layer.o dropout_layer.o maxpool_layer.o softmax_layer.o data.o matrix.o network.o connected_layer.o cost_layer.o parser.o option_list.o detection_layer.o route_layer.o upsample_layer.o box.o normalization_layer.o avgpool_layer.o layer.o local_layer.o shortcut_layer.o logistic_layer.o activation_layer.o rnn_layer.o gru_layer.o crnn_layer.o demo.o batchnorm_layer.o region_layer.o reorg_layer.o tree.o lstm_layer.o l2norm_layer.o yolo_layer.o iseg_layer.o image_opencv.o
|
||||
OBJ=gemm.o utils.o cuda.o deconvolutional_layer.o convolutional_layer.o list.o image.o activations.o im2col.o col2im.o blas.o crop_layer.o dropout_layer.o maxpool_layer.o \
|
||||
softmax_layer.o data.o matrix.o network.o connected_layer.o cost_layer.o parser.o option_list.o detection_layer.o route_layer.o upsample_layer.o box.o normalization_layer.o \
|
||||
avgpool_layer.o layer.o local_layer.o shortcut_layer.o logistic_layer.o activation_layer.o rnn_layer.o gru_layer.o crnn_layer.o demo.o batchnorm_layer.o region_layer.o \
|
||||
reorg_layer.o tree.o lstm_layer.o l2norm_layer.o yolo_layer.o iseg_layer.o
|
||||
ifeq ($(OPENCV), 1)
|
||||
OBJ+=image_opencv.o
|
||||
endif
|
||||
|
||||
EXECOBJA=captcha.o lsd.o super.o art.o tag.o cifar.o go.o rnn.o segmenter.o regressor.o classifier.o coco.o yolo.o detector.o nightmare.o instance-segmenter.o darknet.o
|
||||
ifeq ($(GPU), 1)
|
||||
LDFLAGS+= -lstdc++
|
||||
#EXECOBJA=darknet.o
|
||||
ifeq ($(GPU), 1)
|
||||
LDFLAGS+= -lstdc++
|
||||
OBJ+=convolutional_kernels.o deconvolutional_kernels.o activation_kernels.o im2col_kernels.o col2im_kernels.o blas_kernels.o crop_layer_kernels.o dropout_layer_kernels.o maxpool_layer_kernels.o avgpool_layer_kernels.o
|
||||
endif
|
||||
|
||||
|
@ -72,9 +84,8 @@ DEPS = $(wildcard src/*.h) Makefile include/darknet.h
|
|||
all: obj backup results $(SLIB) $(ALIB) $(EXEC)
|
||||
#all: obj results $(SLIB) $(ALIB) $(EXEC)
|
||||
|
||||
|
||||
$(EXEC): $(EXECOBJ) $(ALIB)
|
||||
$(CC) $(COMMON) $(CFLAGS) $^ -o $@ $(LDFLAGS) $(ALIB)
|
||||
$(CC) $(COMMON) $(CFLAGS) $^ -o $@ $(LDFLAGS)
|
||||
|
||||
$(ALIB): $(OBJS)
|
||||
$(AR) $(ARFLAGS) $@ $^
|
||||
|
@ -89,7 +100,7 @@ $(OBJDIR)%.o: %.c $(DEPS)
|
|||
$(CC) $(COMMON) $(CFLAGS) -c $< -o $@
|
||||
|
||||
$(OBJDIR)%.o: %.cu $(DEPS)
|
||||
$(NVCC) $(ARCH) $(COMMON) --compiler-options "$(CFLAGS)" -c $< -o $@
|
||||
$(NVCC) -c $< -o $@ $(COMMON) $(CFLAGS)
|
||||
|
||||
obj:
|
||||
mkdir -p obj
|
||||
|
|
|
@ -1,3 +1,5 @@
|
|||
# Darknet
|
||||
|
||||
![Darknet Logo](http://pjreddie.com/media/files/darknet-black-small.png)
|
||||
|
||||
# Darknet #
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
classes= 20
|
||||
train = /home/pjreddie/data/voc/train.txt
|
||||
valid = /home/pjreddie/data/voc/2007_test.txt
|
||||
train = /home/public/DL_DATA/VOCdevkit0712/voc2007_2012/train.txt
|
||||
valid = /home/public/DL_DATA/VOCdevkit0712/voc2007_2012/2007_test.txt
|
||||
names = data/voc.names
|
||||
backup = backup
|
||||
|
||||
|
|
|
@ -1,10 +1,11 @@
|
|||
[net]
|
||||
# Testing
|
||||
batch=1
|
||||
subdivisions=1
|
||||
batch=64
|
||||
subdivisions=16
|
||||
# Training
|
||||
# batch=64
|
||||
# subdivisions=16
|
||||
|
||||
width=416
|
||||
height=416
|
||||
channels=3
|
||||
|
|
|
@ -17,7 +17,7 @@ hue=.1
|
|||
|
||||
learning_rate=0.001
|
||||
burn_in=1000
|
||||
max_batches = 500200
|
||||
max_batches = 500303
|
||||
policy=steps
|
||||
steps=400000,450000
|
||||
scales=.1,.1
|
||||
|
|
|
@ -0,0 +1,17 @@
|
|||
model test:
|
||||
./darknet detect cfg/yolov3.cfg ./model_pretrained/yolov3.weights data/dog.jpg
|
||||
./darknet detect cfg/yolov3.cfg ./model_pretrained/yolov3.weights data/giraffe.jpg
|
||||
./darknet detect cfg/yolov3.cfg ../darknet_official/model_pretrained/yolov3.weights data/giraffe.jpg
|
||||
|
||||
|
||||
train model on voc:
|
||||
data preprocess:
|
||||
1.download voc data, 2007 and 2012, then copy file 'scripts/voc_label.py' to voc folder
|
||||
2.run voc_label.py, and change path of the cfg/voc.data
|
||||
*3.change parameters in 'cfg/yolov3.cfg' if need
|
||||
|
||||
train YOLOv3:
|
||||
./darknet detector train cfg/voc.data cfg/yolov3.cfg ./model_pretrained/yolov3.weights
|
||||
|
||||
test after train:
|
||||
./darknet detect cfg/yolov3.cfg ./backup/yolov3_final.weights data/dog.jpg
|
|
@ -7,7 +7,7 @@ void extend_data_truth(data *d, int n, float val)
|
|||
{
|
||||
int i, j;
|
||||
for(i = 0; i < d->y.rows; ++i){
|
||||
d->y.vals[i] = realloc(d->y.vals[i], (d->y.cols+n)*sizeof(float));
|
||||
d->y.vals[i] = (float *) realloc(d->y.vals[i], (d->y.cols+n)*sizeof(float));
|
||||
for(j = 0; j < n; ++j){
|
||||
d->y.vals[i][d->y.cols + j] = val;
|
||||
}
|
||||
|
@ -20,8 +20,8 @@ matrix network_loss_data(network *net, data test)
|
|||
int i,b;
|
||||
int k = 1;
|
||||
matrix pred = make_matrix(test.X.rows, k);
|
||||
float *X = calloc(net->batch*test.X.cols, sizeof(float));
|
||||
float *y = calloc(net->batch*test.y.cols, sizeof(float));
|
||||
float *X = (float*) calloc(net->batch*test.X.cols, sizeof(float));
|
||||
float *y = (float*) calloc(net->batch*test.y.cols, sizeof(float));
|
||||
for(i = 0; i < test.X.rows; i += net->batch){
|
||||
for(b = 0; b < net->batch; ++b){
|
||||
if(i+b == test.X.rows) break;
|
||||
|
@ -60,7 +60,7 @@ void train_attention(char *datacfg, char *cfgfile, char *weightfile, int *gpus,
|
|||
char *base = basecfg(cfgfile);
|
||||
printf("%s\n", base);
|
||||
printf("%d\n", ngpus);
|
||||
network **nets = calloc(ngpus, sizeof(network*));
|
||||
network **nets = (network **)calloc(ngpus, sizeof(network*));
|
||||
|
||||
srand(time(0));
|
||||
int seed = rand();
|
||||
|
@ -152,7 +152,7 @@ void train_attention(char *datacfg, char *cfgfile, char *weightfile, int *gpus,
|
|||
free_matrix(deltas);
|
||||
}
|
||||
}
|
||||
int *inds = calloc(resized.y.rows, sizeof(int));
|
||||
int *inds = (int *)calloc(resized.y.rows, sizeof(int));
|
||||
for(z = 0; z < resized.y.rows; ++z){
|
||||
int index = max_index(resized.y.vals[z] + train.y.cols, divs*divs);
|
||||
inds[z] = index;
|
||||
|
@ -205,7 +205,7 @@ void train_attention(char *datacfg, char *cfgfile, char *weightfile, int *gpus,
|
|||
avg_cls_loss = avg_cls_loss*.9 + closs*.1;
|
||||
avg_att_loss = avg_att_loss*.9 + aloss*.1;
|
||||
|
||||
printf("%ld, %.3f: Att: %f, %f avg, Class: %f, %f avg, %f rate, %lf seconds, %ld images\n", get_current_batch(net), (float)(*net->seen)/N, aloss, avg_att_loss, closs, avg_cls_loss, get_current_rate(net), what_time_is_it_now()-time, *net->seen);
|
||||
printf("%ld, %.3f: Att: %f, %f avg, nclass: %f, %f avg, %f rate, %lf seconds, %ld images\n", get_current_batch(net), (float)(*net->seen)/N, aloss, avg_att_loss, closs, avg_cls_loss, get_current_rate(net), what_time_is_it_now()-time, *net->seen);
|
||||
if(*net->seen/N > epoch){
|
||||
epoch = *net->seen/N;
|
||||
char buff[256];
|
||||
|
@ -255,19 +255,19 @@ void validate_attention_single(char *datacfg, char *filename, char *weightfile)
|
|||
|
||||
float avg_acc = 0;
|
||||
float avg_topk = 0;
|
||||
int *indexes = calloc(topk, sizeof(int));
|
||||
int *indexes = (int *)calloc(topk, sizeof(int));
|
||||
int divs = 4;
|
||||
int size = 2;
|
||||
int extra = 0;
|
||||
float *avgs = calloc(classes, sizeof(float));
|
||||
int *inds = calloc(divs*divs, sizeof(int));
|
||||
float *avgs = (float*) calloc(classes, sizeof(float));
|
||||
int *inds = (int*) calloc(divs*divs, sizeof(int));
|
||||
|
||||
for(i = 0; i < m; ++i){
|
||||
int class = -1;
|
||||
int nclass = -1;
|
||||
char *path = paths[i];
|
||||
for(j = 0; j < classes; ++j){
|
||||
if(strstr(path, labels[j])){
|
||||
class = j;
|
||||
nclass = j;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
@ -309,9 +309,9 @@ void validate_attention_single(char *datacfg, char *filename, char *weightfile)
|
|||
free_image(crop);
|
||||
top_k(pred, classes, topk, indexes);
|
||||
|
||||
if(indexes[0] == class) avg_acc += 1;
|
||||
if(indexes[0] == nclass) avg_acc += 1;
|
||||
for(j = 0; j < topk; ++j){
|
||||
if(indexes[j] == class) avg_topk += 1;
|
||||
if(indexes[j] == nclass) avg_topk += 1;
|
||||
}
|
||||
|
||||
printf("%d: top 1: %f, top %d: %f\n", i, avg_acc/(i+1), topk, avg_topk/(i+1));
|
||||
|
@ -343,18 +343,18 @@ void validate_attention_multi(char *datacfg, char *filename, char *weightfile)
|
|||
|
||||
float avg_acc = 0;
|
||||
float avg_topk = 0;
|
||||
int *indexes = calloc(topk, sizeof(int));
|
||||
int *indexes = (int *)calloc(topk, sizeof(int));
|
||||
|
||||
for(i = 0; i < m; ++i){
|
||||
int class = -1;
|
||||
int nclass = -1;
|
||||
char *path = paths[i];
|
||||
for(j = 0; j < classes; ++j){
|
||||
if(strstr(path, labels[j])){
|
||||
class = j;
|
||||
nclass = j;
|
||||
break;
|
||||
}
|
||||
}
|
||||
float *pred = calloc(classes, sizeof(float));
|
||||
float *pred = (float*) calloc(classes, sizeof(float));
|
||||
image im = load_image_color(paths[i], 0, 0);
|
||||
for(j = 0; j < nscales; ++j){
|
||||
image r = resize_min(im, scales[j]);
|
||||
|
@ -370,9 +370,9 @@ void validate_attention_multi(char *datacfg, char *filename, char *weightfile)
|
|||
free_image(im);
|
||||
top_k(pred, classes, topk, indexes);
|
||||
free(pred);
|
||||
if(indexes[0] == class) avg_acc += 1;
|
||||
if(indexes[0] == nclass) avg_acc += 1;
|
||||
for(j = 0; j < topk; ++j){
|
||||
if(indexes[j] == class) avg_topk += 1;
|
||||
if(indexes[j] == nclass) avg_topk += 1;
|
||||
}
|
||||
|
||||
printf("%d: top 1: %f, top %d: %f\n", i, avg_acc/(i+1), topk, avg_topk/(i+1));
|
||||
|
@ -394,7 +394,7 @@ void predict_attention(char *datacfg, char *cfgfile, char *weightfile, char *fil
|
|||
int i = 0;
|
||||
char **names = get_labels(name_list);
|
||||
clock_t time;
|
||||
int *indexes = calloc(top, sizeof(int));
|
||||
int *indexes = (int *)calloc(top, sizeof(int));
|
||||
char buff[256];
|
||||
char *input = buff;
|
||||
while(1){
|
|
@ -120,8 +120,8 @@ void test_cifar_multi(char *filename, char *weightfile)
|
|||
axpy_cpu(10, 1, p, 1, pred, 1);
|
||||
|
||||
int index = max_index(pred, 10);
|
||||
int class = max_index(test.y.vals[i], 10);
|
||||
if(index == class) avg_acc += 1;
|
||||
int nclass = max_index(test.y.vals[i], 10);
|
||||
if(index == nclass) avg_acc += 1;
|
||||
free_image(im);
|
||||
printf("%4d: %.2f%%\n", i, 100.*avg_acc/(i+1));
|
||||
}
|
||||
|
@ -154,16 +154,16 @@ char *labels[] = {"airplane","automobile","bird","cat","deer","dog","frog","hors
|
|||
data test = load_cifar10_data("data/cifar/cifar-10-batches-bin/test_batch.bin");
|
||||
for(i = 0; i < train.X.rows; ++i){
|
||||
image im = float_to_image(32, 32, 3, train.X.vals[i]);
|
||||
int class = max_index(train.y.vals[i], 10);
|
||||
int nclass = max_index(train.y.vals[i], 10);
|
||||
char buff[256];
|
||||
sprintf(buff, "data/cifar/train/%d_%s",i,labels[class]);
|
||||
sprintf(buff, "data/cifar/train/%d_%s",i,labels[nclass]);
|
||||
save_image_options(im, buff, PNG, 0);
|
||||
}
|
||||
for(i = 0; i < test.X.rows; ++i){
|
||||
image im = float_to_image(32, 32, 3, test.X.vals[i]);
|
||||
int class = max_index(test.y.vals[i], 10);
|
||||
int nclass = max_index(test.y.vals[i], 10);
|
||||
char buff[256];
|
||||
sprintf(buff, "data/cifar/test/%d_%s",i,labels[class]);
|
||||
sprintf(buff, "data/cifar/test/%d_%s",i,labels[nclass]);
|
||||
save_image_options(im, buff, PNG, 0);
|
||||
}
|
||||
}
|
|
@ -5,7 +5,7 @@
|
|||
|
||||
float *get_regression_values(char **labels, int n)
|
||||
{
|
||||
float *v = calloc(n, sizeof(float));
|
||||
float *v = (float*) calloc(n, sizeof(float));
|
||||
int i;
|
||||
for(i = 0; i < n; ++i){
|
||||
char *p = strchr(labels[i], ' ');
|
||||
|
@ -23,7 +23,7 @@ void train_classifier(char *datacfg, char *cfgfile, char *weightfile, int *gpus,
|
|||
char *base = basecfg(cfgfile);
|
||||
printf("%s\n", base);
|
||||
printf("%d\n", ngpus);
|
||||
network **nets = calloc(ngpus, sizeof(network*));
|
||||
network **nets = (network **)calloc(ngpus, sizeof(network*));
|
||||
|
||||
srand(time(0));
|
||||
int seed = rand();
|
||||
|
@ -254,14 +254,14 @@ void validate_classifier_10(char *datacfg, char *filename, char *weightfile)
|
|||
|
||||
float avg_acc = 0;
|
||||
float avg_topk = 0;
|
||||
int *indexes = calloc(topk, sizeof(int));
|
||||
int *indexes = (int *)calloc(topk, sizeof(int));
|
||||
|
||||
for(i = 0; i < m; ++i){
|
||||
int class = -1;
|
||||
int nclass = -1;
|
||||
char *path = paths[i];
|
||||
for(j = 0; j < classes; ++j){
|
||||
if(strstr(path, labels[j])){
|
||||
class = j;
|
||||
nclass = j;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
@ -281,7 +281,7 @@ void validate_classifier_10(char *datacfg, char *filename, char *weightfile)
|
|||
images[7] = crop_image(im, 0, 0, w, h);
|
||||
images[8] = crop_image(im, -shift, shift, w, h);
|
||||
images[9] = crop_image(im, shift, shift, w, h);
|
||||
float *pred = calloc(classes, sizeof(float));
|
||||
float *pred = (float*) calloc(classes, sizeof(float));
|
||||
for(j = 0; j < 10; ++j){
|
||||
float *p = network_predict(net, images[j].data);
|
||||
if(net->hierarchy) hierarchy_predictions(p, net->outputs, net->hierarchy, 1, 1);
|
||||
|
@ -291,9 +291,9 @@ void validate_classifier_10(char *datacfg, char *filename, char *weightfile)
|
|||
free_image(im);
|
||||
top_k(pred, classes, topk, indexes);
|
||||
free(pred);
|
||||
if(indexes[0] == class) avg_acc += 1;
|
||||
if(indexes[0] == nclass) avg_acc += 1;
|
||||
for(j = 0; j < topk; ++j){
|
||||
if(indexes[j] == class) avg_topk += 1;
|
||||
if(indexes[j] == nclass) avg_topk += 1;
|
||||
}
|
||||
|
||||
printf("%d: top 1: %f, top %d: %f\n", i, avg_acc/(i+1), topk, avg_topk/(i+1));
|
||||
|
@ -323,15 +323,15 @@ void validate_classifier_full(char *datacfg, char *filename, char *weightfile)
|
|||
|
||||
float avg_acc = 0;
|
||||
float avg_topk = 0;
|
||||
int *indexes = calloc(topk, sizeof(int));
|
||||
int *indexes = (int *)calloc(topk, sizeof(int));
|
||||
|
||||
int size = net->w;
|
||||
for(i = 0; i < m; ++i){
|
||||
int class = -1;
|
||||
int nclass = -1;
|
||||
char *path = paths[i];
|
||||
for(j = 0; j < classes; ++j){
|
||||
if(strstr(path, labels[j])){
|
||||
class = j;
|
||||
nclass = j;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
@ -348,9 +348,9 @@ void validate_classifier_full(char *datacfg, char *filename, char *weightfile)
|
|||
free_image(resized);
|
||||
top_k(pred, classes, topk, indexes);
|
||||
|
||||
if(indexes[0] == class) avg_acc += 1;
|
||||
if(indexes[0] == nclass) avg_acc += 1;
|
||||
for(j = 0; j < topk; ++j){
|
||||
if(indexes[j] == class) avg_topk += 1;
|
||||
if(indexes[j] == nclass) avg_topk += 1;
|
||||
}
|
||||
|
||||
printf("%d: top 1: %f, top %d: %f\n", i, avg_acc/(i+1), topk, avg_topk/(i+1));
|
||||
|
@ -383,14 +383,14 @@ void validate_classifier_single(char *datacfg, char *filename, char *weightfile)
|
|||
|
||||
float avg_acc = 0;
|
||||
float avg_topk = 0;
|
||||
int *indexes = calloc(topk, sizeof(int));
|
||||
int *indexes = (int *)calloc(topk, sizeof(int));
|
||||
|
||||
for(i = 0; i < m; ++i){
|
||||
int class = -1;
|
||||
int nclass = -1;
|
||||
char *path = paths[i];
|
||||
for(j = 0; j < classes; ++j){
|
||||
if(strstr(path, labels[j])){
|
||||
class = j;
|
||||
nclass = j;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
@ -407,12 +407,12 @@ void validate_classifier_single(char *datacfg, char *filename, char *weightfile)
|
|||
free_image(crop);
|
||||
top_k(pred, classes, topk, indexes);
|
||||
|
||||
if(indexes[0] == class) avg_acc += 1;
|
||||
if(indexes[0] == nclass) avg_acc += 1;
|
||||
for(j = 0; j < topk; ++j){
|
||||
if(indexes[j] == class) avg_topk += 1;
|
||||
if(indexes[j] == nclass) avg_topk += 1;
|
||||
}
|
||||
|
||||
printf("%s, %d, %f, %f, \n", paths[i], class, pred[0], pred[1]);
|
||||
printf("%s, %d, %f, %f, \n", paths[i], nclass, pred[0], pred[1]);
|
||||
printf("%d: top 1: %f, top %d: %f\n", i, avg_acc/(i+1), topk, avg_topk/(i+1));
|
||||
}
|
||||
}
|
||||
|
@ -443,18 +443,18 @@ void validate_classifier_multi(char *datacfg, char *cfg, char *weights)
|
|||
|
||||
float avg_acc = 0;
|
||||
float avg_topk = 0;
|
||||
int *indexes = calloc(topk, sizeof(int));
|
||||
int *indexes = (int *)calloc(topk, sizeof(int));
|
||||
|
||||
for(i = 0; i < m; ++i){
|
||||
int class = -1;
|
||||
int nclass = -1;
|
||||
char *path = paths[i];
|
||||
for(j = 0; j < classes; ++j){
|
||||
if(strstr(path, labels[j])){
|
||||
class = j;
|
||||
nclass = j;
|
||||
break;
|
||||
}
|
||||
}
|
||||
float *pred = calloc(classes, sizeof(float));
|
||||
float *pred = (float*) calloc(classes, sizeof(float));
|
||||
image im = load_image_color(paths[i], 0, 0);
|
||||
for(j = 0; j < nscales; ++j){
|
||||
image r = resize_max(im, scales[j]);
|
||||
|
@ -470,9 +470,9 @@ void validate_classifier_multi(char *datacfg, char *cfg, char *weights)
|
|||
free_image(im);
|
||||
top_k(pred, classes, topk, indexes);
|
||||
free(pred);
|
||||
if(indexes[0] == class) avg_acc += 1;
|
||||
if(indexes[0] == nclass) avg_acc += 1;
|
||||
for(j = 0; j < topk; ++j){
|
||||
if(indexes[j] == class) avg_topk += 1;
|
||||
if(indexes[j] == nclass) avg_topk += 1;
|
||||
}
|
||||
|
||||
printf("%d: top 1: %f, top %d: %f\n", i, avg_acc/(i+1), topk, avg_topk/(i+1));
|
||||
|
@ -494,7 +494,7 @@ void try_classifier(char *datacfg, char *cfgfile, char *weightfile, char *filena
|
|||
int i = 0;
|
||||
char **names = get_labels(name_list);
|
||||
clock_t time;
|
||||
int *indexes = calloc(top, sizeof(int));
|
||||
int *indexes = (int *)calloc(top, sizeof(int));
|
||||
char buff[256];
|
||||
char *input = buff;
|
||||
while(1){
|
||||
|
@ -572,7 +572,7 @@ void predict_classifier(char *datacfg, char *cfgfile, char *weightfile, char *fi
|
|||
int i = 0;
|
||||
char **names = get_labels(name_list);
|
||||
clock_t time;
|
||||
int *indexes = calloc(top, sizeof(int));
|
||||
int *indexes = (int *)calloc(top, sizeof(int));
|
||||
char buff[256];
|
||||
char *input = buff;
|
||||
while(1){
|
||||
|
@ -662,7 +662,7 @@ void csv_classifier(char *datacfg, char *cfgfile, char *weightfile)
|
|||
char **paths = (char **)list_to_array(plist);
|
||||
int m = plist->size;
|
||||
free_list(plist);
|
||||
int *indexes = calloc(top, sizeof(int));
|
||||
int *indexes = (int *)calloc(top, sizeof(int));
|
||||
|
||||
for(i = 0; i < m; ++i){
|
||||
double time = what_time_is_it_now();
|
||||
|
@ -813,7 +813,7 @@ void threat_classifier(char *datacfg, char *cfgfile, char *weightfile, int cam_i
|
|||
char *name_list = option_find_str(options, "names", 0);
|
||||
char **names = get_labels(name_list);
|
||||
|
||||
int *indexes = calloc(top, sizeof(int));
|
||||
int *indexes = (int *)calloc(top, sizeof(int));
|
||||
|
||||
if(!cap) error("Couldn't connect to webcam.\n");
|
||||
//cvNamedWindow("Threat", CV_WINDOW_NORMAL);
|
||||
|
@ -935,7 +935,7 @@ void gun_classifier(char *datacfg, char *cfgfile, char *weightfile, int cam_inde
|
|||
char *name_list = option_find_str(options, "names", 0);
|
||||
char **names = get_labels(name_list);
|
||||
|
||||
int *indexes = calloc(top, sizeof(int));
|
||||
int *indexes = (int *)calloc(top, sizeof(int));
|
||||
|
||||
if(!cap) error("Couldn't connect to webcam.\n");
|
||||
float fps = 0;
|
||||
|
@ -1005,7 +1005,7 @@ void demo_classifier(char *datacfg, char *cfgfile, char *weightfile, int cam_ind
|
|||
char *name_list = option_find_str(options, "names", label_list);
|
||||
char **names = get_labels(name_list);
|
||||
|
||||
int *indexes = calloc(top, sizeof(int));
|
||||
int *indexes = (int *)calloc(top, sizeof(int));
|
||||
|
||||
if(!cap) error("Couldn't connect to webcam.\n");
|
||||
float fps = 0;
|
|
@ -155,11 +155,11 @@ void validate_coco(char *cfg, char *weights)
|
|||
float iou_thresh = .5;
|
||||
|
||||
int nthreads = 8;
|
||||
image *val = calloc(nthreads, sizeof(image));
|
||||
image *val_resized = calloc(nthreads, sizeof(image));
|
||||
image *buf = calloc(nthreads, sizeof(image));
|
||||
image *buf_resized = calloc(nthreads, sizeof(image));
|
||||
pthread_t *thr = calloc(nthreads, sizeof(pthread_t));
|
||||
image *val = (image *)calloc(nthreads, sizeof(image));
|
||||
image *val_resized = (image *)calloc(nthreads, sizeof(image));
|
||||
image *buf = (image *)calloc(nthreads, sizeof(image));
|
||||
image *buf_resized = (image *)calloc(nthreads, sizeof(image));
|
||||
pthread_t *thr = (pthread_t *)calloc(nthreads, sizeof(pthread_t));
|
||||
|
||||
load_args args = {0};
|
||||
args.w = net->w;
|
||||
|
@ -225,7 +225,7 @@ void validate_coco_recall(char *cfgfile, char *weightfile)
|
|||
int side = l.side;
|
||||
|
||||
int j, k;
|
||||
FILE **fps = calloc(classes, sizeof(FILE *));
|
||||
FILE **fps = (FILE **)calloc(classes, sizeof(FILE *));
|
||||
for(j = 0; j < classes; ++j){
|
||||
char buff[1024];
|
||||
snprintf(buff, 1024, "%s%s.txt", base, coco_classes[j]);
|
|
@ -5,101 +5,117 @@
|
|||
#include <stdio.h>
|
||||
|
||||
extern void predict_classifier(char *datacfg, char *cfgfile, char *weightfile, char *filename, int top);
|
||||
extern void test_detector(char *datacfg, char *cfgfile, char *weightfile, char *filename, float thresh, float hier_thresh, char *outfile, int fullscreen);
|
||||
|
||||
extern void
|
||||
test_detector(char *datacfg, char *cfgfile, char *weightfile, char *filename, float thresh, float hier_thresh,
|
||||
char *outfile, int fullscreen);
|
||||
|
||||
extern void run_yolo(int argc, char **argv);
|
||||
|
||||
extern void run_detector(int argc, char **argv);
|
||||
|
||||
extern void run_coco(int argc, char **argv);
|
||||
|
||||
extern void run_nightmare(int argc, char **argv);
|
||||
|
||||
extern void run_classifier(int argc, char **argv);
|
||||
|
||||
extern void run_regressor(int argc, char **argv);
|
||||
|
||||
extern void run_segmenter(int argc, char **argv);
|
||||
|
||||
extern void run_isegmenter(int argc, char **argv);
|
||||
|
||||
extern void run_char_rnn(int argc, char **argv);
|
||||
|
||||
extern void run_tag(int argc, char **argv);
|
||||
|
||||
extern void run_cifar(int argc, char **argv);
|
||||
|
||||
extern void run_go(int argc, char **argv);
|
||||
|
||||
extern void run_art(int argc, char **argv);
|
||||
|
||||
extern void run_super(int argc, char **argv);
|
||||
|
||||
extern void run_lsd(int argc, char **argv);
|
||||
|
||||
void average(int argc, char *argv[])
|
||||
{
|
||||
void average(int argc, char *argv[]) {
|
||||
char *cfgfile = argv[2];
|
||||
char *outfile = argv[3];
|
||||
gpu_index = -1;
|
||||
network *net = parse_network_cfg(cfgfile);
|
||||
network *sum = parse_network_cfg(cfgfile);
|
||||
|
||||
char *weightfile = argv[4];
|
||||
char *weightfile = argv[4];
|
||||
load_weights(sum, weightfile);
|
||||
|
||||
int i, j;
|
||||
int n = argc - 5;
|
||||
for(i = 0; i < n; ++i){
|
||||
weightfile = argv[i+5];
|
||||
for (i = 0; i < n; ++i) {
|
||||
weightfile = argv[i + 5];
|
||||
load_weights(net, weightfile);
|
||||
for(j = 0; j < net->n; ++j){
|
||||
for (j = 0; j < net->n; ++j) {
|
||||
layer l = net->layers[j];
|
||||
layer out = sum->layers[j];
|
||||
if(l.type == CONVOLUTIONAL){
|
||||
int num = l.n*l.c*l.size*l.size;
|
||||
if (l.type == CONVOLUTIONAL) {
|
||||
int num = l.n * l.c * l.size * l.size;
|
||||
axpy_cpu(l.n, 1, l.biases, 1, out.biases, 1);
|
||||
axpy_cpu(num, 1, l.weights, 1, out.weights, 1);
|
||||
if(l.batch_normalize){
|
||||
if (l.batch_normalize) {
|
||||
axpy_cpu(l.n, 1, l.scales, 1, out.scales, 1);
|
||||
axpy_cpu(l.n, 1, l.rolling_mean, 1, out.rolling_mean, 1);
|
||||
axpy_cpu(l.n, 1, l.rolling_variance, 1, out.rolling_variance, 1);
|
||||
}
|
||||
}
|
||||
if(l.type == CONNECTED){
|
||||
if (l.type == CONNECTED) {
|
||||
axpy_cpu(l.outputs, 1, l.biases, 1, out.biases, 1);
|
||||
axpy_cpu(l.outputs*l.inputs, 1, l.weights, 1, out.weights, 1);
|
||||
axpy_cpu(l.outputs * l.inputs, 1, l.weights, 1, out.weights, 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
n = n+1;
|
||||
for(j = 0; j < net->n; ++j){
|
||||
n = n + 1;
|
||||
for (j = 0; j < net->n; ++j) {
|
||||
layer l = sum->layers[j];
|
||||
if(l.type == CONVOLUTIONAL){
|
||||
int num = l.n*l.c*l.size*l.size;
|
||||
scal_cpu(l.n, 1./n, l.biases, 1);
|
||||
scal_cpu(num, 1./n, l.weights, 1);
|
||||
if(l.batch_normalize){
|
||||
scal_cpu(l.n, 1./n, l.scales, 1);
|
||||
scal_cpu(l.n, 1./n, l.rolling_mean, 1);
|
||||
scal_cpu(l.n, 1./n, l.rolling_variance, 1);
|
||||
}
|
||||
if (l.type == CONVOLUTIONAL) {
|
||||
int num = l.n * l.c * l.size * l.size;
|
||||
scal_cpu(l.n, 1. / n, l.biases, 1);
|
||||
scal_cpu(num, 1. / n, l.weights, 1);
|
||||
if (l.batch_normalize) {
|
||||
scal_cpu(l.n, 1. / n, l.scales, 1);
|
||||
scal_cpu(l.n, 1. / n, l.rolling_mean, 1);
|
||||
scal_cpu(l.n, 1. / n, l.rolling_variance, 1);
|
||||
}
|
||||
}
|
||||
if(l.type == CONNECTED){
|
||||
scal_cpu(l.outputs, 1./n, l.biases, 1);
|
||||
scal_cpu(l.outputs*l.inputs, 1./n, l.weights, 1);
|
||||
if (l.type == CONNECTED) {
|
||||
scal_cpu(l.outputs, 1. / n, l.biases, 1);
|
||||
scal_cpu(l.outputs * l.inputs, 1. / n, l.weights, 1);
|
||||
}
|
||||
}
|
||||
save_weights(sum, outfile);
|
||||
}
|
||||
|
||||
long numops(network *net)
|
||||
{
|
||||
long numops(network *net) {
|
||||
int i;
|
||||
long ops = 0;
|
||||
for(i = 0; i < net->n; ++i){
|
||||
for (i = 0; i < net->n; ++i) {
|
||||
layer l = net->layers[i];
|
||||
if(l.type == CONVOLUTIONAL){
|
||||
ops += 2l * l.n * l.size*l.size*l.c/l.groups * l.out_h*l.out_w;
|
||||
} else if(l.type == CONNECTED){
|
||||
if (l.type == CONVOLUTIONAL) {
|
||||
ops += 2l * l.n * l.size * l.size * l.c / l.groups * l.out_h * l.out_w;
|
||||
} else if (l.type == CONNECTED) {
|
||||
ops += 2l * l.inputs * l.outputs;
|
||||
} else if (l.type == RNN){
|
||||
} else if (l.type == RNN) {
|
||||
ops += 2l * l.input_layer->inputs * l.input_layer->outputs;
|
||||
ops += 2l * l.self_layer->inputs * l.self_layer->outputs;
|
||||
ops += 2l * l.output_layer->inputs * l.output_layer->outputs;
|
||||
} else if (l.type == GRU){
|
||||
} else if (l.type == GRU) {
|
||||
ops += 2l * l.uz->inputs * l.uz->outputs;
|
||||
ops += 2l * l.uh->inputs * l.uh->outputs;
|
||||
ops += 2l * l.ur->inputs * l.ur->outputs;
|
||||
ops += 2l * l.wz->inputs * l.wz->outputs;
|
||||
ops += 2l * l.wh->inputs * l.wh->outputs;
|
||||
ops += 2l * l.wr->inputs * l.wr->outputs;
|
||||
} else if (l.type == LSTM){
|
||||
} else if (l.type == LSTM) {
|
||||
ops += 2l * l.uf->inputs * l.uf->outputs;
|
||||
ops += 2l * l.ui->inputs * l.ui->outputs;
|
||||
ops += 2l * l.ug->inputs * l.ug->outputs;
|
||||
|
@ -113,67 +129,63 @@ long numops(network *net)
|
|||
return ops;
|
||||
}
|
||||
|
||||
void speed(char *cfgfile, int tics)
|
||||
{
|
||||
void speed(char *cfgfile, int tics) {
|
||||
if (tics == 0) tics = 1000;
|
||||
network *net = parse_network_cfg(cfgfile);
|
||||
set_batch_network(net, 1);
|
||||
int i;
|
||||
double time=what_time_is_it_now();
|
||||
image im = make_image(net->w, net->h, net->c*net->batch);
|
||||
for(i = 0; i < tics; ++i){
|
||||
double time = what_time_is_it_now();
|
||||
image im = make_image(net->w, net->h, net->c * net->batch);
|
||||
for (i = 0; i < tics; ++i) {
|
||||
network_predict(net, im.data);
|
||||
}
|
||||
double t = what_time_is_it_now() - time;
|
||||
long ops = numops(net);
|
||||
printf("\n%d evals, %f Seconds\n", tics, t);
|
||||
printf("Floating Point Operations: %.2f Bn\n", (float)ops/1000000000.);
|
||||
printf("FLOPS: %.2f Bn\n", (float)ops/1000000000.*tics/t);
|
||||
printf("Speed: %f sec/eval\n", t/tics);
|
||||
printf("Speed: %f Hz\n", tics/t);
|
||||
printf("Floating Point Operations: %.2f Bn\n", (float) ops / 1000000000.);
|
||||
printf("FLOPS: %.2f Bn\n", (float) ops / 1000000000. * tics / t);
|
||||
printf("Speed: %f sec/eval\n", t / tics);
|
||||
printf("Speed: %f Hz\n", tics / t);
|
||||
}
|
||||
|
||||
void operations(char *cfgfile)
|
||||
{
|
||||
void operations(char *cfgfile) {
|
||||
gpu_index = -1;
|
||||
network *net = parse_network_cfg(cfgfile);
|
||||
long ops = numops(net);
|
||||
printf("Floating Point Operations: %ld\n", ops);
|
||||
printf("Floating Point Operations: %.2f Bn\n", (float)ops/1000000000.);
|
||||
printf("Floating Point Operations: %.2f Bn\n", (float) ops / 1000000000.);
|
||||
}
|
||||
|
||||
void oneoff(char *cfgfile, char *weightfile, char *outfile)
|
||||
{
|
||||
void oneoff(char *cfgfile, char *weightfile, char *outfile) {
|
||||
gpu_index = -1;
|
||||
network *net = parse_network_cfg(cfgfile);
|
||||
int oldn = net->layers[net->n - 2].n;
|
||||
int c = net->layers[net->n - 2].c;
|
||||
scal_cpu(oldn*c, .1, net->layers[net->n - 2].weights, 1);
|
||||
scal_cpu(oldn * c, .1, net->layers[net->n - 2].weights, 1);
|
||||
scal_cpu(oldn, 0, net->layers[net->n - 2].biases, 1);
|
||||
net->layers[net->n - 2].n = 11921;
|
||||
net->layers[net->n - 2].biases += 5;
|
||||
net->layers[net->n - 2].weights += 5*c;
|
||||
if(weightfile){
|
||||
net->layers[net->n - 2].weights += 5 * c;
|
||||
if (weightfile) {
|
||||
load_weights(net, weightfile);
|
||||
}
|
||||
net->layers[net->n - 2].biases -= 5;
|
||||
net->layers[net->n - 2].weights -= 5*c;
|
||||
net->layers[net->n - 2].weights -= 5 * c;
|
||||
net->layers[net->n - 2].n = oldn;
|
||||
printf("%d\n", oldn);
|
||||
layer l = net->layers[net->n - 2];
|
||||
copy_cpu(l.n/3, l.biases, 1, l.biases + l.n/3, 1);
|
||||
copy_cpu(l.n/3, l.biases, 1, l.biases + 2*l.n/3, 1);
|
||||
copy_cpu(l.n/3*l.c, l.weights, 1, l.weights + l.n/3*l.c, 1);
|
||||
copy_cpu(l.n/3*l.c, l.weights, 1, l.weights + 2*l.n/3*l.c, 1);
|
||||
copy_cpu(l.n / 3, l.biases, 1, l.biases + l.n / 3, 1);
|
||||
copy_cpu(l.n / 3, l.biases, 1, l.biases + 2 * l.n / 3, 1);
|
||||
copy_cpu(l.n / 3 * l.c, l.weights, 1, l.weights + l.n / 3 * l.c, 1);
|
||||
copy_cpu(l.n / 3 * l.c, l.weights, 1, l.weights + 2 * l.n / 3 * l.c, 1);
|
||||
*net->seen = 0;
|
||||
save_weights(net, outfile);
|
||||
}
|
||||
|
||||
void oneoff2(char *cfgfile, char *weightfile, char *outfile, int l)
|
||||
{
|
||||
void oneoff2(char *cfgfile, char *weightfile, char *outfile, int l) {
|
||||
gpu_index = -1;
|
||||
network *net = parse_network_cfg(cfgfile);
|
||||
if(weightfile){
|
||||
if (weightfile) {
|
||||
load_weights_upto(net, weightfile, 0, net->n);
|
||||
load_weights_upto(net, weightfile, l, net->n);
|
||||
}
|
||||
|
@ -181,25 +193,23 @@ void oneoff2(char *cfgfile, char *weightfile, char *outfile, int l)
|
|||
save_weights_upto(net, outfile, net->n);
|
||||
}
|
||||
|
||||
void partial(char *cfgfile, char *weightfile, char *outfile, int max)
|
||||
{
|
||||
void partial(char *cfgfile, char *weightfile, char *outfile, int max) {
|
||||
gpu_index = -1;
|
||||
network *net = load_network(cfgfile, weightfile, 1);
|
||||
save_weights_upto(net, outfile, max);
|
||||
}
|
||||
|
||||
void print_weights(char *cfgfile, char *weightfile, int n)
|
||||
{
|
||||
void print_weights(char *cfgfile, char *weightfile, int n) {
|
||||
gpu_index = -1;
|
||||
network *net = load_network(cfgfile, weightfile, 1);
|
||||
layer l = net->layers[n];
|
||||
int i, j;
|
||||
//printf("[");
|
||||
for(i = 0; i < l.n; ++i){
|
||||
for (i = 0; i < l.n; ++i) {
|
||||
//printf("[");
|
||||
for(j = 0; j < l.size*l.size*l.c; ++j){
|
||||
for (j = 0; j < l.size * l.size * l.c; ++j) {
|
||||
//if(j > 0) printf(",");
|
||||
printf("%g ", l.weights[i*l.size*l.size*l.c + j]);
|
||||
printf("%g ", l.weights[i * l.size * l.size * l.c + j]);
|
||||
}
|
||||
printf("\n");
|
||||
//printf("]%s\n", (i == l.n-1)?"":",");
|
||||
|
@ -207,14 +217,13 @@ void print_weights(char *cfgfile, char *weightfile, int n)
|
|||
//printf("]");
|
||||
}
|
||||
|
||||
void rescale_net(char *cfgfile, char *weightfile, char *outfile)
|
||||
{
|
||||
void rescale_net(char *cfgfile, char *weightfile, char *outfile) {
|
||||
gpu_index = -1;
|
||||
network *net = load_network(cfgfile, weightfile, 0);
|
||||
int i;
|
||||
for(i = 0; i < net->n; ++i){
|
||||
for (i = 0; i < net->n; ++i) {
|
||||
layer l = net->layers[i];
|
||||
if(l.type == CONVOLUTIONAL){
|
||||
if (l.type == CONVOLUTIONAL) {
|
||||
rescale_weights(l, 2, -.5);
|
||||
break;
|
||||
}
|
||||
|
@ -222,14 +231,13 @@ void rescale_net(char *cfgfile, char *weightfile, char *outfile)
|
|||
save_weights(net, outfile);
|
||||
}
|
||||
|
||||
void rgbgr_net(char *cfgfile, char *weightfile, char *outfile)
|
||||
{
|
||||
void rgbgr_net(char *cfgfile, char *weightfile, char *outfile) {
|
||||
gpu_index = -1;
|
||||
network *net = load_network(cfgfile, weightfile, 0);
|
||||
int i;
|
||||
for(i = 0; i < net->n; ++i){
|
||||
for (i = 0; i < net->n; ++i) {
|
||||
layer l = net->layers[i];
|
||||
if(l.type == CONVOLUTIONAL){
|
||||
if (l.type == CONVOLUTIONAL) {
|
||||
rgbgr_weights(l);
|
||||
break;
|
||||
}
|
||||
|
@ -237,8 +245,7 @@ void rgbgr_net(char *cfgfile, char *weightfile, char *outfile)
|
|||
save_weights(net, outfile);
|
||||
}
|
||||
|
||||
void reset_normalize_net(char *cfgfile, char *weightfile, char *outfile)
|
||||
{
|
||||
void reset_normalize_net(char *cfgfile, char *weightfile, char *outfile) {
|
||||
gpu_index = -1;
|
||||
network *net = load_network(cfgfile, weightfile, 0);
|
||||
int i;
|
||||
|
@ -262,27 +269,25 @@ void reset_normalize_net(char *cfgfile, char *weightfile, char *outfile)
|
|||
save_weights(net, outfile);
|
||||
}
|
||||
|
||||
layer normalize_layer(layer l, int n)
|
||||
{
|
||||
layer normalize_layer(layer l, int n) {
|
||||
int j;
|
||||
l.batch_normalize=1;
|
||||
l.scales = calloc(n, sizeof(float));
|
||||
for(j = 0; j < n; ++j){
|
||||
l.batch_normalize = 1;
|
||||
l.scales = (float *) calloc(n, sizeof(float));
|
||||
for (j = 0; j < n; ++j) {
|
||||
l.scales[j] = 1;
|
||||
}
|
||||
l.rolling_mean = calloc(n, sizeof(float));
|
||||
l.rolling_variance = calloc(n, sizeof(float));
|
||||
l.rolling_mean = (float *) calloc(n, sizeof(float));
|
||||
l.rolling_variance = (float *) calloc(n, sizeof(float));
|
||||
return l;
|
||||
}
|
||||
|
||||
void normalize_net(char *cfgfile, char *weightfile, char *outfile)
|
||||
{
|
||||
void normalize_net(char *cfgfile, char *weightfile, char *outfile) {
|
||||
gpu_index = -1;
|
||||
network *net = load_network(cfgfile, weightfile, 0);
|
||||
int i;
|
||||
for(i = 0; i < net->n; ++i){
|
||||
for (i = 0; i < net->n; ++i) {
|
||||
layer l = net->layers[i];
|
||||
if(l.type == CONVOLUTIONAL && !l.batch_normalize){
|
||||
if (l.type == CONVOLUTIONAL && !l.batch_normalize) {
|
||||
net->layers[i] = normalize_layer(l, l.n);
|
||||
}
|
||||
if (l.type == CONNECTED && !l.batch_normalize) {
|
||||
|
@ -295,14 +300,13 @@ void normalize_net(char *cfgfile, char *weightfile, char *outfile)
|
|||
*l.state_z_layer = normalize_layer(*l.state_z_layer, l.state_z_layer->outputs);
|
||||
*l.state_r_layer = normalize_layer(*l.state_r_layer, l.state_r_layer->outputs);
|
||||
*l.state_h_layer = normalize_layer(*l.state_h_layer, l.state_h_layer->outputs);
|
||||
net->layers[i].batch_normalize=1;
|
||||
net->layers[i].batch_normalize = 1;
|
||||
}
|
||||
}
|
||||
save_weights(net, outfile);
|
||||
}
|
||||
|
||||
void statistics_net(char *cfgfile, char *weightfile)
|
||||
{
|
||||
void statistics_net(char *cfgfile, char *weightfile) {
|
||||
gpu_index = -1;
|
||||
network *net = load_network(cfgfile, weightfile, 0);
|
||||
int i;
|
||||
|
@ -331,8 +335,7 @@ void statistics_net(char *cfgfile, char *weightfile)
|
|||
}
|
||||
}
|
||||
|
||||
void denormalize_net(char *cfgfile, char *weightfile, char *outfile)
|
||||
{
|
||||
void denormalize_net(char *cfgfile, char *weightfile, char *outfile) {
|
||||
gpu_index = -1;
|
||||
network *net = load_network(cfgfile, weightfile, 0);
|
||||
int i;
|
||||
|
@ -340,11 +343,11 @@ void denormalize_net(char *cfgfile, char *weightfile, char *outfile)
|
|||
layer l = net->layers[i];
|
||||
if ((l.type == DECONVOLUTIONAL || l.type == CONVOLUTIONAL) && l.batch_normalize) {
|
||||
denormalize_convolutional_layer(l);
|
||||
net->layers[i].batch_normalize=0;
|
||||
net->layers[i].batch_normalize = 0;
|
||||
}
|
||||
if (l.type == CONNECTED && l.batch_normalize) {
|
||||
denormalize_connected_layer(l);
|
||||
net->layers[i].batch_normalize=0;
|
||||
net->layers[i].batch_normalize = 0;
|
||||
}
|
||||
if (l.type == GRU && l.batch_normalize) {
|
||||
denormalize_connected_layer(*l.input_z_layer);
|
||||
|
@ -359,28 +362,27 @@ void denormalize_net(char *cfgfile, char *weightfile, char *outfile)
|
|||
l.state_z_layer->batch_normalize = 0;
|
||||
l.state_r_layer->batch_normalize = 0;
|
||||
l.state_h_layer->batch_normalize = 0;
|
||||
net->layers[i].batch_normalize=0;
|
||||
net->layers[i].batch_normalize = 0;
|
||||
}
|
||||
}
|
||||
save_weights(net, outfile);
|
||||
}
|
||||
|
||||
void mkimg(char *cfgfile, char *weightfile, int h, int w, int num, char *prefix)
|
||||
{
|
||||
void mkimg(char *cfgfile, char *weightfile, int h, int w, int num, char *prefix) {
|
||||
network *net = load_network(cfgfile, weightfile, 0);
|
||||
image *ims = get_weights(net->layers[0]);
|
||||
int n = net->layers[0].n;
|
||||
int z;
|
||||
for(z = 0; z < num; ++z){
|
||||
for (z = 0; z < num; ++z) {
|
||||
image im = make_image(h, w, 3);
|
||||
fill_image(im, .5);
|
||||
int i;
|
||||
for(i = 0; i < 100; ++i){
|
||||
image r = copy_image(ims[rand()%n]);
|
||||
rotate_image_cw(r, rand()%4);
|
||||
for (i = 0; i < 100; ++i) {
|
||||
image r = copy_image(ims[rand() % n]);
|
||||
rotate_image_cw(r, rand() % 4);
|
||||
random_distort_image(r, 1, 1.5, 1.5);
|
||||
int dx = rand()%(w-r.w);
|
||||
int dy = rand()%(h-r.h);
|
||||
int dx = rand() % (w - r.w);
|
||||
int dy = rand() % (h - r.h);
|
||||
ghost_image(r, im, dx, dy);
|
||||
free_image(r);
|
||||
}
|
||||
|
@ -391,23 +393,22 @@ void mkimg(char *cfgfile, char *weightfile, int h, int w, int num, char *prefix)
|
|||
}
|
||||
}
|
||||
|
||||
void visualize(char *cfgfile, char *weightfile)
|
||||
{
|
||||
void visualize(char *cfgfile, char *weightfile) {
|
||||
network *net = load_network(cfgfile, weightfile, 0);
|
||||
visualize_network(net);
|
||||
}
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
int main(int argc, char **argv) {
|
||||
printf("argv is %s\n", argv[1]);
|
||||
//test_resize("data/bad.jpg");
|
||||
//test_box();
|
||||
//test_convolutional_layer();
|
||||
if(argc < 2){
|
||||
if (argc < 2) {
|
||||
fprintf(stderr, "usage: %s <function>\n", argv[0]);
|
||||
return 0;
|
||||
}
|
||||
gpu_index = find_int_arg(argc, argv, "-i", 0);
|
||||
if(find_arg(argc, argv, "-nogpu")) {
|
||||
if (find_arg(argc, argv, "-nogpu")) {
|
||||
gpu_index = -1;
|
||||
}
|
||||
|
||||
|
@ -419,81 +420,83 @@ int main(int argc, char **argv)
|
|||
}
|
||||
#endif
|
||||
|
||||
if (0 == strcmp(argv[1], "average")){
|
||||
printf("gpu_index is %d\n", gpu_index);
|
||||
|
||||
if (0 == strcmp(argv[1], "average")) {
|
||||
average(argc, argv);
|
||||
} else if (0 == strcmp(argv[1], "yolo")){
|
||||
} else if (0 == strcmp(argv[1], "yolo")) {
|
||||
run_yolo(argc, argv);
|
||||
} else if (0 == strcmp(argv[1], "super")){
|
||||
} else if (0 == strcmp(argv[1], "super")) {
|
||||
run_super(argc, argv);
|
||||
} else if (0 == strcmp(argv[1], "lsd")){
|
||||
} else if (0 == strcmp(argv[1], "lsd")) {
|
||||
run_lsd(argc, argv);
|
||||
} else if (0 == strcmp(argv[1], "detector")){
|
||||
} else if (0 == strcmp(argv[1], "detector")) {
|
||||
run_detector(argc, argv);
|
||||
} else if (0 == strcmp(argv[1], "detect")){
|
||||
} else if (0 == strcmp(argv[1], "detect")) {
|
||||
float thresh = find_float_arg(argc, argv, "-thresh", .5);
|
||||
char *filename = (argc > 4) ? argv[4]: 0;
|
||||
char *filename = (argc > 4) ? argv[4] : 0;
|
||||
char *outfile = find_char_arg(argc, argv, "-out", 0);
|
||||
int fullscreen = find_arg(argc, argv, "-fullscreen");
|
||||
test_detector("cfg/coco.data", argv[2], argv[3], filename, thresh, .5, outfile, fullscreen);
|
||||
} else if (0 == strcmp(argv[1], "cifar")){
|
||||
} else if (0 == strcmp(argv[1], "cifar")) {
|
||||
run_cifar(argc, argv);
|
||||
} else if (0 == strcmp(argv[1], "go")){
|
||||
} else if (0 == strcmp(argv[1], "go")) {
|
||||
run_go(argc, argv);
|
||||
} else if (0 == strcmp(argv[1], "rnn")){
|
||||
} else if (0 == strcmp(argv[1], "rnn")) {
|
||||
run_char_rnn(argc, argv);
|
||||
} else if (0 == strcmp(argv[1], "coco")){
|
||||
} else if (0 == strcmp(argv[1], "coco")) {
|
||||
run_coco(argc, argv);
|
||||
} else if (0 == strcmp(argv[1], "classify")){
|
||||
} else if (0 == strcmp(argv[1], "classify")) {
|
||||
predict_classifier("cfg/imagenet1k.data", argv[2], argv[3], argv[4], 5);
|
||||
} else if (0 == strcmp(argv[1], "classifier")){
|
||||
} else if (0 == strcmp(argv[1], "classifier")) {
|
||||
run_classifier(argc, argv);
|
||||
} else if (0 == strcmp(argv[1], "regressor")){
|
||||
} else if (0 == strcmp(argv[1], "regressor")) {
|
||||
run_regressor(argc, argv);
|
||||
} else if (0 == strcmp(argv[1], "isegmenter")){
|
||||
} else if (0 == strcmp(argv[1], "isegmenter")) {
|
||||
run_isegmenter(argc, argv);
|
||||
} else if (0 == strcmp(argv[1], "segmenter")){
|
||||
} else if (0 == strcmp(argv[1], "segmenter")) {
|
||||
run_segmenter(argc, argv);
|
||||
} else if (0 == strcmp(argv[1], "art")){
|
||||
} else if (0 == strcmp(argv[1], "art")) {
|
||||
run_art(argc, argv);
|
||||
} else if (0 == strcmp(argv[1], "tag")){
|
||||
} else if (0 == strcmp(argv[1], "tag")) {
|
||||
run_tag(argc, argv);
|
||||
} else if (0 == strcmp(argv[1], "3d")){
|
||||
} else if (0 == strcmp(argv[1], "3d")) {
|
||||
composite_3d(argv[2], argv[3], argv[4], (argc > 5) ? atof(argv[5]) : 0);
|
||||
} else if (0 == strcmp(argv[1], "test")){
|
||||
} else if (0 == strcmp(argv[1], "test")) {
|
||||
test_resize(argv[2]);
|
||||
} else if (0 == strcmp(argv[1], "nightmare")){
|
||||
} else if (0 == strcmp(argv[1], "nightmare")) {
|
||||
run_nightmare(argc, argv);
|
||||
} else if (0 == strcmp(argv[1], "rgbgr")){
|
||||
} else if (0 == strcmp(argv[1], "rgbgr")) {
|
||||
rgbgr_net(argv[2], argv[3], argv[4]);
|
||||
} else if (0 == strcmp(argv[1], "reset")){
|
||||
} else if (0 == strcmp(argv[1], "reset")) {
|
||||
reset_normalize_net(argv[2], argv[3], argv[4]);
|
||||
} else if (0 == strcmp(argv[1], "denormalize")){
|
||||
} else if (0 == strcmp(argv[1], "denormalize")) {
|
||||
denormalize_net(argv[2], argv[3], argv[4]);
|
||||
} else if (0 == strcmp(argv[1], "statistics")){
|
||||
} else if (0 == strcmp(argv[1], "statistics")) {
|
||||
statistics_net(argv[2], argv[3]);
|
||||
} else if (0 == strcmp(argv[1], "normalize")){
|
||||
} else if (0 == strcmp(argv[1], "normalize")) {
|
||||
normalize_net(argv[2], argv[3], argv[4]);
|
||||
} else if (0 == strcmp(argv[1], "rescale")){
|
||||
} else if (0 == strcmp(argv[1], "rescale")) {
|
||||
rescale_net(argv[2], argv[3], argv[4]);
|
||||
} else if (0 == strcmp(argv[1], "ops")){
|
||||
} else if (0 == strcmp(argv[1], "ops")) {
|
||||
operations(argv[2]);
|
||||
} else if (0 == strcmp(argv[1], "speed")){
|
||||
} else if (0 == strcmp(argv[1], "speed")) {
|
||||
speed(argv[2], (argc > 3 && argv[3]) ? atoi(argv[3]) : 0);
|
||||
} else if (0 == strcmp(argv[1], "oneoff")){
|
||||
} else if (0 == strcmp(argv[1], "oneoff")) {
|
||||
oneoff(argv[2], argv[3], argv[4]);
|
||||
} else if (0 == strcmp(argv[1], "oneoff2")){
|
||||
} else if (0 == strcmp(argv[1], "oneoff2")) {
|
||||
oneoff2(argv[2], argv[3], argv[4], atoi(argv[5]));
|
||||
} else if (0 == strcmp(argv[1], "print")){
|
||||
} else if (0 == strcmp(argv[1], "print")) {
|
||||
print_weights(argv[2], argv[3], atoi(argv[4]));
|
||||
} else if (0 == strcmp(argv[1], "partial")){
|
||||
} else if (0 == strcmp(argv[1], "partial")) {
|
||||
partial(argv[2], argv[3], argv[4], atoi(argv[5]));
|
||||
} else if (0 == strcmp(argv[1], "average")){
|
||||
} else if (0 == strcmp(argv[1], "average")) {
|
||||
average(argc, argv);
|
||||
} else if (0 == strcmp(argv[1], "visualize")){
|
||||
} else if (0 == strcmp(argv[1], "visualize")) {
|
||||
visualize(argv[2], (argc > 3) ? argv[3] : 0);
|
||||
} else if (0 == strcmp(argv[1], "mkimg")){
|
||||
} else if (0 == strcmp(argv[1], "mkimg")) {
|
||||
mkimg(argv[2], argv[3], atoi(argv[4]), atoi(argv[5]), atoi(argv[6]), argv[7]);
|
||||
} else if (0 == strcmp(argv[1], "imtest")){
|
||||
} else if (0 == strcmp(argv[1], "imtest")) {
|
||||
test_resize(argv[2]);
|
||||
} else {
|
||||
fprintf(stderr, "Not an option: %s\n", argv[1]);
|
|
@ -1,56 +0,0 @@
|
|||
# Stupid python path shit.
|
||||
# Instead just add darknet.py to somewhere in your python path
|
||||
# OK actually that might not be a great idea, idk, work in progress
|
||||
# Use at your own risk. or don't, i don't care
|
||||
|
||||
from scipy.misc import imread
|
||||
import cv2
|
||||
|
||||
def array_to_image(arr):
|
||||
arr = arr.transpose(2,0,1)
|
||||
c = arr.shape[0]
|
||||
h = arr.shape[1]
|
||||
w = arr.shape[2]
|
||||
arr = (arr/255.0).flatten()
|
||||
data = dn.c_array(dn.c_float, arr)
|
||||
im = dn.IMAGE(w,h,c,data)
|
||||
return im
|
||||
|
||||
def detect2(net, meta, image, thresh=.5, hier_thresh=.5, nms=.45):
|
||||
boxes = dn.make_boxes(net)
|
||||
probs = dn.make_probs(net)
|
||||
num = dn.num_boxes(net)
|
||||
dn.network_detect(net, image, thresh, hier_thresh, nms, boxes, probs)
|
||||
res = []
|
||||
for j in range(num):
|
||||
for i in range(meta.classes):
|
||||
if probs[j][i] > 0:
|
||||
res.append((meta.names[i], probs[j][i], (boxes[j].x, boxes[j].y, boxes[j].w, boxes[j].h)))
|
||||
res = sorted(res, key=lambda x: -x[1])
|
||||
dn.free_ptrs(dn.cast(probs, dn.POINTER(dn.c_void_p)), num)
|
||||
return res
|
||||
|
||||
import sys, os
|
||||
sys.path.append(os.path.join(os.getcwd(),'python/'))
|
||||
|
||||
import darknet as dn
|
||||
|
||||
# Darknet
|
||||
net = dn.load_net("cfg/tiny-yolo.cfg", "tiny-yolo.weights", 0)
|
||||
meta = dn.load_meta("cfg/coco.data")
|
||||
r = dn.detect(net, meta, "data/dog.jpg")
|
||||
print r
|
||||
|
||||
# scipy
|
||||
arr= imread('data/dog.jpg')
|
||||
im = array_to_image(arr)
|
||||
r = detect2(net, meta, im)
|
||||
print r
|
||||
|
||||
# OpenCV
|
||||
arr = cv2.imread('data/dog.jpg')
|
||||
im = array_to_image(arr)
|
||||
dn.rgbgr_image(im)
|
||||
r = detect2(net, meta, im)
|
||||
print r
|
||||
|
|
@ -1,10 +1,12 @@
|
|||
#include "darknet.h"
|
||||
|
||||
static int coco_ids[] = {1,2,3,4,5,6,7,8,9,10,11,13,14,15,16,17,18,19,20,21,22,23,24,25,27,28,31,32,33,34,35,36,37,38,39,40,41,42,43,44,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,67,70,72,73,74,75,76,77,78,79,80,81,82,84,85,86,87,88,89,90};
|
||||
static int coco_ids[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 27, 28,
|
||||
31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55,
|
||||
56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 67, 70, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 84,
|
||||
85, 86, 87, 88, 89, 90};
|
||||
|
||||
|
||||
void train_detector(char *datacfg, char *cfgfile, char *weightfile, int *gpus, int ngpus, int clear)
|
||||
{
|
||||
void train_detector(char *datacfg, char *cfgfile, char *weightfile, int *gpus, int ngpus, int clear) {
|
||||
list *options = read_data_cfg(datacfg);
|
||||
char *train_images = option_find_str(options, "train", "data/train.list");
|
||||
char *backup_directory = option_find_str(options, "backup", "/backup/");
|
||||
|
@ -13,12 +15,12 @@ void train_detector(char *datacfg, char *cfgfile, char *weightfile, int *gpus, i
|
|||
char *base = basecfg(cfgfile);
|
||||
printf("%s\n", base);
|
||||
float avg_loss = -1;
|
||||
network **nets = calloc(ngpus, sizeof(network));
|
||||
network **nets = (network **) calloc(ngpus, sizeof(network));
|
||||
|
||||
srand(time(0));
|
||||
int seed = rand();
|
||||
int i;
|
||||
for(i = 0; i < ngpus; ++i){
|
||||
for (i = 0; i < ngpus; ++i) {
|
||||
srand(seed);
|
||||
#ifdef GPU
|
||||
cuda_set_device(gpus[i]);
|
||||
|
@ -40,7 +42,7 @@ void train_detector(char *datacfg, char *cfgfile, char *weightfile, int *gpus, i
|
|||
|
||||
list *plist = get_paths(train_images);
|
||||
//int N = plist->size;
|
||||
char **paths = (char **)list_to_array(plist);
|
||||
char **paths = (char **) list_to_array(plist);
|
||||
|
||||
load_args args = get_base_args(net);
|
||||
args.coords = l.coords;
|
||||
|
@ -59,11 +61,11 @@ void train_detector(char *datacfg, char *cfgfile, char *weightfile, int *gpus, i
|
|||
double time;
|
||||
int count = 0;
|
||||
//while(i*imgs < N*120){
|
||||
while(get_current_batch(net) < net->max_batches){
|
||||
if(l.random && count++%10 == 0){
|
||||
while (get_current_batch(net) < net->max_batches) {
|
||||
if (l.random && count++ % 10 == 0) {
|
||||
printf("Resizing\n");
|
||||
int dim = (rand() % 10 + 10) * 32;
|
||||
if (get_current_batch(net)+200 > net->max_batches) dim = 608;
|
||||
if (get_current_batch(net) + 200 > net->max_batches) dim = 608;
|
||||
//int dim = (rand() % 4 + 16) * 32;
|
||||
printf("%d\n", dim);
|
||||
args.w = dim;
|
||||
|
@ -74,13 +76,13 @@ void train_detector(char *datacfg, char *cfgfile, char *weightfile, int *gpus, i
|
|||
free_data(train);
|
||||
load_thread = load_data(args);
|
||||
|
||||
#pragma omp parallel for
|
||||
for(i = 0; i < ngpus; ++i){
|
||||
#pragma omp parallel for
|
||||
for (i = 0; i < ngpus; ++i) {
|
||||
resize_network(nets[i], dim, dim);
|
||||
}
|
||||
net = nets[0];
|
||||
}
|
||||
time=what_time_is_it_now();
|
||||
time = what_time_is_it_now();
|
||||
pthread_join(load_thread, 0);
|
||||
train = buffer;
|
||||
load_thread = load_data(args);
|
||||
|
@ -109,9 +111,9 @@ void train_detector(char *datacfg, char *cfgfile, char *weightfile, int *gpus, i
|
|||
}
|
||||
*/
|
||||
|
||||
printf("Loaded: %lf seconds\n", what_time_is_it_now()-time);
|
||||
printf("Loaded: %lf seconds\n", what_time_is_it_now() - time);
|
||||
|
||||
time=what_time_is_it_now();
|
||||
time = what_time_is_it_now();
|
||||
float loss = 0;
|
||||
#ifdef GPU
|
||||
if(ngpus == 1){
|
||||
|
@ -123,11 +125,12 @@ void train_detector(char *datacfg, char *cfgfile, char *weightfile, int *gpus, i
|
|||
loss = train_network(net, train);
|
||||
#endif
|
||||
if (avg_loss < 0) avg_loss = loss;
|
||||
avg_loss = avg_loss*.9 + loss*.1;
|
||||
avg_loss = avg_loss * .9 + loss * .1;
|
||||
|
||||
i = get_current_batch(net);
|
||||
printf("%ld: %f, %f avg, %f rate, %lf seconds, %d images\n", get_current_batch(net), loss, avg_loss, get_current_rate(net), what_time_is_it_now()-time, i*imgs);
|
||||
if(i%100==0){
|
||||
printf("%ld: %f, %f avg, %f rate, %lf seconds, %d images\n", get_current_batch(net), loss, avg_loss,
|
||||
get_current_rate(net), what_time_is_it_now() - time, i * imgs);
|
||||
if (i % 100 == 0) {
|
||||
#ifdef GPU
|
||||
if(ngpus != 1) sync_nets(nets, ngpus, 0);
|
||||
#endif
|
||||
|
@ -135,7 +138,7 @@ void train_detector(char *datacfg, char *cfgfile, char *weightfile, int *gpus, i
|
|||
sprintf(buff, "%s/%s.backup", backup_directory, base);
|
||||
save_weights(net, buff);
|
||||
}
|
||||
if(i%10000==0 || (i < 1000 && i%100 == 0)){
|
||||
if (i % 10000 == 0 || (i < 1000 && i % 100 == 0)) {
|
||||
#ifdef GPU
|
||||
if(ngpus != 1) sync_nets(nets, ngpus, 0);
|
||||
#endif
|
||||
|
@ -154,23 +157,21 @@ void train_detector(char *datacfg, char *cfgfile, char *weightfile, int *gpus, i
|
|||
}
|
||||
|
||||
|
||||
static int get_coco_image_id(char *filename)
|
||||
{
|
||||
static int get_coco_image_id(char *filename) {
|
||||
char *p = strrchr(filename, '/');
|
||||
char *c = strrchr(filename, '_');
|
||||
if(c) p = c;
|
||||
return atoi(p+1);
|
||||
if (c) p = c;
|
||||
return atoi(p + 1);
|
||||
}
|
||||
|
||||
static void print_cocos(FILE *fp, char *image_path, detection *dets, int num_boxes, int classes, int w, int h)
|
||||
{
|
||||
static void print_cocos(FILE *fp, char *image_path, detection *dets, int num_boxes, int classes, int w, int h) {
|
||||
int i, j;
|
||||
int image_id = get_coco_image_id(image_path);
|
||||
for(i = 0; i < num_boxes; ++i){
|
||||
float xmin = dets[i].bbox.x - dets[i].bbox.w/2.;
|
||||
float xmax = dets[i].bbox.x + dets[i].bbox.w/2.;
|
||||
float ymin = dets[i].bbox.y - dets[i].bbox.h/2.;
|
||||
float ymax = dets[i].bbox.y + dets[i].bbox.h/2.;
|
||||
for (i = 0; i < num_boxes; ++i) {
|
||||
float xmin = dets[i].bbox.x - dets[i].bbox.w / 2.;
|
||||
float xmax = dets[i].bbox.x + dets[i].bbox.w / 2.;
|
||||
float ymin = dets[i].bbox.y - dets[i].bbox.h / 2.;
|
||||
float ymax = dets[i].bbox.y + dets[i].bbox.h / 2.;
|
||||
|
||||
if (xmin < 0) xmin = 0;
|
||||
if (ymin < 0) ymin = 0;
|
||||
|
@ -182,57 +183,58 @@ static void print_cocos(FILE *fp, char *image_path, detection *dets, int num_box
|
|||
float bw = xmax - xmin;
|
||||
float bh = ymax - ymin;
|
||||
|
||||
for(j = 0; j < classes; ++j){
|
||||
if (dets[i].prob[j]) fprintf(fp, "{\"image_id\":%d, \"category_id\":%d, \"bbox\":[%f, %f, %f, %f], \"score\":%f},\n", image_id, coco_ids[j], bx, by, bw, bh, dets[i].prob[j]);
|
||||
for (j = 0; j < classes; ++j) {
|
||||
if (dets[i].prob[j])
|
||||
fprintf(fp, "{\"image_id\":%d, \"category_id\":%d, \"bbox\":[%f, %f, %f, %f], \"score\":%f},\n",
|
||||
image_id, coco_ids[j], bx, by, bw, bh, dets[i].prob[j]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void print_detector_detections(FILE **fps, char *id, detection *dets, int total, int classes, int w, int h)
|
||||
{
|
||||
void print_detector_detections(FILE **fps, char *id, detection *dets, int total, int classes, int w, int h) {
|
||||
int i, j;
|
||||
for(i = 0; i < total; ++i){
|
||||
float xmin = dets[i].bbox.x - dets[i].bbox.w/2. + 1;
|
||||
float xmax = dets[i].bbox.x + dets[i].bbox.w/2. + 1;
|
||||
float ymin = dets[i].bbox.y - dets[i].bbox.h/2. + 1;
|
||||
float ymax = dets[i].bbox.y + dets[i].bbox.h/2. + 1;
|
||||
for (i = 0; i < total; ++i) {
|
||||
float xmin = dets[i].bbox.x - dets[i].bbox.w / 2. + 1;
|
||||
float xmax = dets[i].bbox.x + dets[i].bbox.w / 2. + 1;
|
||||
float ymin = dets[i].bbox.y - dets[i].bbox.h / 2. + 1;
|
||||
float ymax = dets[i].bbox.y + dets[i].bbox.h / 2. + 1;
|
||||
|
||||
if (xmin < 1) xmin = 1;
|
||||
if (ymin < 1) ymin = 1;
|
||||
if (xmax > w) xmax = w;
|
||||
if (ymax > h) ymax = h;
|
||||
|
||||
for(j = 0; j < classes; ++j){
|
||||
if (dets[i].prob[j]) fprintf(fps[j], "%s %f %f %f %f %f\n", id, dets[i].prob[j],
|
||||
xmin, ymin, xmax, ymax);
|
||||
for (j = 0; j < classes; ++j) {
|
||||
if (dets[i].prob[j])
|
||||
fprintf(fps[j], "%s %f %f %f %f %f\n", id, dets[i].prob[j],
|
||||
xmin, ymin, xmax, ymax);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void print_imagenet_detections(FILE *fp, int id, detection *dets, int total, int classes, int w, int h)
|
||||
{
|
||||
void print_imagenet_detections(FILE *fp, int id, detection *dets, int total, int classes, int w, int h) {
|
||||
int i, j;
|
||||
for(i = 0; i < total; ++i){
|
||||
float xmin = dets[i].bbox.x - dets[i].bbox.w/2.;
|
||||
float xmax = dets[i].bbox.x + dets[i].bbox.w/2.;
|
||||
float ymin = dets[i].bbox.y - dets[i].bbox.h/2.;
|
||||
float ymax = dets[i].bbox.y + dets[i].bbox.h/2.;
|
||||
for (i = 0; i < total; ++i) {
|
||||
float xmin = dets[i].bbox.x - dets[i].bbox.w / 2.;
|
||||
float xmax = dets[i].bbox.x + dets[i].bbox.w / 2.;
|
||||
float ymin = dets[i].bbox.y - dets[i].bbox.h / 2.;
|
||||
float ymax = dets[i].bbox.y + dets[i].bbox.h / 2.;
|
||||
|
||||
if (xmin < 0) xmin = 0;
|
||||
if (ymin < 0) ymin = 0;
|
||||
if (xmax > w) xmax = w;
|
||||
if (ymax > h) ymax = h;
|
||||
|
||||
for(j = 0; j < classes; ++j){
|
||||
int class = j;
|
||||
if (dets[i].prob[class]) fprintf(fp, "%d %d %f %f %f %f %f\n", id, j+1, dets[i].prob[class],
|
||||
xmin, ymin, xmax, ymax);
|
||||
for (j = 0; j < classes; ++j) {
|
||||
int nclass = j;
|
||||
if (dets[i].prob[nclass])
|
||||
fprintf(fp, "%d %d %f %f %f %f %f\n", id, j + 1, dets[i].prob[nclass],
|
||||
xmin, ymin, xmax, ymax);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void validate_detector_flip(char *datacfg, char *cfgfile, char *weightfile, char *outfile)
|
||||
{
|
||||
void validate_detector_flip(char *datacfg, char *cfgfile, char *weightfile, char *outfile) {
|
||||
int j;
|
||||
list *options = read_data_cfg(datacfg);
|
||||
char *valid_images = option_find_str(options, "valid", "data/train.list");
|
||||
|
@ -249,9 +251,9 @@ void validate_detector_flip(char *datacfg, char *cfgfile, char *weightfile, char
|
|||
srand(time(0));
|
||||
|
||||
list *plist = get_paths(valid_images);
|
||||
char **paths = (char **)list_to_array(plist);
|
||||
char **paths = (char **) list_to_array(plist);
|
||||
|
||||
layer l = net->layers[net->n-1];
|
||||
layer l = net->layers[net->n - 1];
|
||||
int classes = l.classes;
|
||||
|
||||
char buff[1024];
|
||||
|
@ -260,42 +262,42 @@ void validate_detector_flip(char *datacfg, char *cfgfile, char *weightfile, char
|
|||
FILE **fps = 0;
|
||||
int coco = 0;
|
||||
int imagenet = 0;
|
||||
if(0==strcmp(type, "coco")){
|
||||
if(!outfile) outfile = "coco_results";
|
||||
if (0 == strcmp(type, "coco")) {
|
||||
if (!outfile) outfile = "coco_results";
|
||||
snprintf(buff, 1024, "%s/%s.json", prefix, outfile);
|
||||
fp = fopen(buff, "w");
|
||||
fprintf(fp, "[\n");
|
||||
coco = 1;
|
||||
} else if(0==strcmp(type, "imagenet")){
|
||||
if(!outfile) outfile = "imagenet-detection";
|
||||
} else if (0 == strcmp(type, "imagenet")) {
|
||||
if (!outfile) outfile = "imagenet-detection";
|
||||
snprintf(buff, 1024, "%s/%s.txt", prefix, outfile);
|
||||
fp = fopen(buff, "w");
|
||||
imagenet = 1;
|
||||
classes = 200;
|
||||
} else {
|
||||
if(!outfile) outfile = "comp4_det_test_";
|
||||
fps = calloc(classes, sizeof(FILE *));
|
||||
for(j = 0; j < classes; ++j){
|
||||
if (!outfile) outfile = "comp4_det_test_";
|
||||
fps = (FILE **) calloc(classes, sizeof(FILE * ));
|
||||
for (j = 0; j < classes; ++j) {
|
||||
snprintf(buff, 1024, "%s/%s%s.txt", prefix, outfile, names[j]);
|
||||
fps[j] = fopen(buff, "w");
|
||||
}
|
||||
}
|
||||
|
||||
int m = plist->size;
|
||||
int i=0;
|
||||
int i = 0;
|
||||
int t;
|
||||
|
||||
float thresh = .005;
|
||||
float nms = .45;
|
||||
|
||||
int nthreads = 4;
|
||||
image *val = calloc(nthreads, sizeof(image));
|
||||
image *val_resized = calloc(nthreads, sizeof(image));
|
||||
image *buf = calloc(nthreads, sizeof(image));
|
||||
image *buf_resized = calloc(nthreads, sizeof(image));
|
||||
pthread_t *thr = calloc(nthreads, sizeof(pthread_t));
|
||||
image *val = (image *) calloc(nthreads, sizeof(image));
|
||||
image *val_resized = (image *) calloc(nthreads, sizeof(image));
|
||||
image *buf = (image *) calloc(nthreads, sizeof(image));
|
||||
image *buf_resized = (image *) calloc(nthreads, sizeof(image));
|
||||
pthread_t *thr = (pthread_t *) calloc(nthreads, sizeof(pthread_t));
|
||||
|
||||
image input = make_image(net->w, net->h, net->c*2);
|
||||
image input = make_image(net->w, net->h, net->c * 2);
|
||||
|
||||
load_args args = {0};
|
||||
args.w = net->w;
|
||||
|
@ -303,32 +305,32 @@ void validate_detector_flip(char *datacfg, char *cfgfile, char *weightfile, char
|
|||
//args.type = IMAGE_DATA;
|
||||
args.type = LETTERBOX_DATA;
|
||||
|
||||
for(t = 0; t < nthreads; ++t){
|
||||
args.path = paths[i+t];
|
||||
for (t = 0; t < nthreads; ++t) {
|
||||
args.path = paths[i + t];
|
||||
args.im = &buf[t];
|
||||
args.resized = &buf_resized[t];
|
||||
thr[t] = load_data_in_thread(args);
|
||||
}
|
||||
double start = what_time_is_it_now();
|
||||
for(i = nthreads; i < m+nthreads; i += nthreads){
|
||||
for (i = nthreads; i < m + nthreads; i += nthreads) {
|
||||
fprintf(stderr, "%d\n", i);
|
||||
for(t = 0; t < nthreads && i+t-nthreads < m; ++t){
|
||||
for (t = 0; t < nthreads && i + t - nthreads < m; ++t) {
|
||||
pthread_join(thr[t], 0);
|
||||
val[t] = buf[t];
|
||||
val_resized[t] = buf_resized[t];
|
||||
}
|
||||
for(t = 0; t < nthreads && i+t < m; ++t){
|
||||
args.path = paths[i+t];
|
||||
for (t = 0; t < nthreads && i + t < m; ++t) {
|
||||
args.path = paths[i + t];
|
||||
args.im = &buf[t];
|
||||
args.resized = &buf_resized[t];
|
||||
thr[t] = load_data_in_thread(args);
|
||||
}
|
||||
for(t = 0; t < nthreads && i+t-nthreads < m; ++t){
|
||||
char *path = paths[i+t-nthreads];
|
||||
for (t = 0; t < nthreads && i + t - nthreads < m; ++t) {
|
||||
char *path = paths[i + t - nthreads];
|
||||
char *id = basecfg(path);
|
||||
copy_cpu(net->w*net->h*net->c, val_resized[t].data, 1, input.data, 1);
|
||||
copy_cpu(net->w * net->h * net->c, val_resized[t].data, 1, input.data, 1);
|
||||
flip_image(val_resized[t]);
|
||||
copy_cpu(net->w*net->h*net->c, val_resized[t].data, 1, input.data + net->w*net->h*net->c, 1);
|
||||
copy_cpu(net->w * net->h * net->c, val_resized[t].data, 1, input.data + net->w * net->h * net->c, 1);
|
||||
|
||||
network_predict(net, input.data);
|
||||
int w = val[t].w;
|
||||
|
@ -336,10 +338,10 @@ void validate_detector_flip(char *datacfg, char *cfgfile, char *weightfile, char
|
|||
int num = 0;
|
||||
detection *dets = get_network_boxes(net, w, h, thresh, .5, map, 0, &num);
|
||||
if (nms) do_nms_sort(dets, num, classes, nms);
|
||||
if (coco){
|
||||
if (coco) {
|
||||
print_cocos(fp, path, dets, num, classes, w, h);
|
||||
} else if (imagenet){
|
||||
print_imagenet_detections(fp, i+t-nthreads+1, dets, num, classes, w, h);
|
||||
} else if (imagenet) {
|
||||
print_imagenet_detections(fp, i + t - nthreads + 1, dets, num, classes, w, h);
|
||||
} else {
|
||||
print_detector_detections(fps, id, dets, num, classes, w, h);
|
||||
}
|
||||
|
@ -349,11 +351,11 @@ void validate_detector_flip(char *datacfg, char *cfgfile, char *weightfile, char
|
|||
free_image(val_resized[t]);
|
||||
}
|
||||
}
|
||||
for(j = 0; j < classes; ++j){
|
||||
if(fps) fclose(fps[j]);
|
||||
for (j = 0; j < classes; ++j) {
|
||||
if (fps) fclose(fps[j]);
|
||||
}
|
||||
if(coco){
|
||||
fseek(fp, -2, SEEK_CUR);
|
||||
if (coco) {
|
||||
fseek(fp, -2, SEEK_CUR);
|
||||
fprintf(fp, "\n]\n");
|
||||
fclose(fp);
|
||||
}
|
||||
|
@ -361,8 +363,7 @@ void validate_detector_flip(char *datacfg, char *cfgfile, char *weightfile, char
|
|||
}
|
||||
|
||||
|
||||
void validate_detector(char *datacfg, char *cfgfile, char *weightfile, char *outfile)
|
||||
{
|
||||
void validate_detector(char *datacfg, char *cfgfile, char *weightfile, char *outfile) {
|
||||
int j;
|
||||
list *options = read_data_cfg(datacfg);
|
||||
char *valid_images = option_find_str(options, "valid", "data/train.list");
|
||||
|
@ -379,9 +380,9 @@ void validate_detector(char *datacfg, char *cfgfile, char *weightfile, char *out
|
|||
srand(time(0));
|
||||
|
||||
list *plist = get_paths(valid_images);
|
||||
char **paths = (char **)list_to_array(plist);
|
||||
char **paths = (char **) list_to_array(plist);
|
||||
|
||||
layer l = net->layers[net->n-1];
|
||||
layer l = net->layers[net->n - 1];
|
||||
int classes = l.classes;
|
||||
|
||||
char buff[1024];
|
||||
|
@ -390,22 +391,22 @@ void validate_detector(char *datacfg, char *cfgfile, char *weightfile, char *out
|
|||
FILE **fps = 0;
|
||||
int coco = 0;
|
||||
int imagenet = 0;
|
||||
if(0==strcmp(type, "coco")){
|
||||
if(!outfile) outfile = "coco_results";
|
||||
if (0 == strcmp(type, "coco")) {
|
||||
if (!outfile) outfile = "coco_results";
|
||||
snprintf(buff, 1024, "%s/%s.json", prefix, outfile);
|
||||
fp = fopen(buff, "w");
|
||||
fprintf(fp, "[\n");
|
||||
coco = 1;
|
||||
} else if(0==strcmp(type, "imagenet")){
|
||||
if(!outfile) outfile = "imagenet-detection";
|
||||
} else if (0 == strcmp(type, "imagenet")) {
|
||||
if (!outfile) outfile = "imagenet-detection";
|
||||
snprintf(buff, 1024, "%s/%s.txt", prefix, outfile);
|
||||
fp = fopen(buff, "w");
|
||||
imagenet = 1;
|
||||
classes = 200;
|
||||
} else {
|
||||
if(!outfile) outfile = "comp4_det_test_";
|
||||
fps = calloc(classes, sizeof(FILE *));
|
||||
for(j = 0; j < classes; ++j){
|
||||
if (!outfile) outfile = "comp4_det_test_";
|
||||
fps = (FILE **)calloc(classes, sizeof(FILE * ));
|
||||
for (j = 0; j < classes; ++j) {
|
||||
snprintf(buff, 1024, "%s/%s%s.txt", prefix, outfile, names[j]);
|
||||
fps[j] = fopen(buff, "w");
|
||||
}
|
||||
|
@ -413,18 +414,18 @@ void validate_detector(char *datacfg, char *cfgfile, char *weightfile, char *out
|
|||
|
||||
|
||||
int m = plist->size;
|
||||
int i=0;
|
||||
int i = 0;
|
||||
int t;
|
||||
|
||||
float thresh = .005;
|
||||
float nms = .45;
|
||||
|
||||
int nthreads = 4;
|
||||
image *val = calloc(nthreads, sizeof(image));
|
||||
image *val_resized = calloc(nthreads, sizeof(image));
|
||||
image *buf = calloc(nthreads, sizeof(image));
|
||||
image *buf_resized = calloc(nthreads, sizeof(image));
|
||||
pthread_t *thr = calloc(nthreads, sizeof(pthread_t));
|
||||
image *val = (image *)calloc(nthreads, sizeof(image));
|
||||
image *val_resized = (image *)calloc(nthreads, sizeof(image));
|
||||
image *buf = (image *)calloc(nthreads, sizeof(image));
|
||||
image *buf_resized = (image *)calloc(nthreads, sizeof(image));
|
||||
pthread_t *thr = (pthread_t *)calloc(nthreads, sizeof(pthread_t));
|
||||
|
||||
load_args args = {0};
|
||||
args.w = net->w;
|
||||
|
@ -432,28 +433,28 @@ void validate_detector(char *datacfg, char *cfgfile, char *weightfile, char *out
|
|||
//args.type = IMAGE_DATA;
|
||||
args.type = LETTERBOX_DATA;
|
||||
|
||||
for(t = 0; t < nthreads; ++t){
|
||||
args.path = paths[i+t];
|
||||
for (t = 0; t < nthreads; ++t) {
|
||||
args.path = paths[i + t];
|
||||
args.im = &buf[t];
|
||||
args.resized = &buf_resized[t];
|
||||
thr[t] = load_data_in_thread(args);
|
||||
}
|
||||
double start = what_time_is_it_now();
|
||||
for(i = nthreads; i < m+nthreads; i += nthreads){
|
||||
for (i = nthreads; i < m + nthreads; i += nthreads) {
|
||||
fprintf(stderr, "%d\n", i);
|
||||
for(t = 0; t < nthreads && i+t-nthreads < m; ++t){
|
||||
for (t = 0; t < nthreads && i + t - nthreads < m; ++t) {
|
||||
pthread_join(thr[t], 0);
|
||||
val[t] = buf[t];
|
||||
val_resized[t] = buf_resized[t];
|
||||
}
|
||||
for(t = 0; t < nthreads && i+t < m; ++t){
|
||||
args.path = paths[i+t];
|
||||
for (t = 0; t < nthreads && i + t < m; ++t) {
|
||||
args.path = paths[i + t];
|
||||
args.im = &buf[t];
|
||||
args.resized = &buf_resized[t];
|
||||
thr[t] = load_data_in_thread(args);
|
||||
}
|
||||
for(t = 0; t < nthreads && i+t-nthreads < m; ++t){
|
||||
char *path = paths[i+t-nthreads];
|
||||
for (t = 0; t < nthreads && i + t - nthreads < m; ++t) {
|
||||
char *path = paths[i + t - nthreads];
|
||||
char *id = basecfg(path);
|
||||
float *X = val_resized[t].data;
|
||||
network_predict(net, X);
|
||||
|
@ -462,10 +463,10 @@ void validate_detector(char *datacfg, char *cfgfile, char *weightfile, char *out
|
|||
int nboxes = 0;
|
||||
detection *dets = get_network_boxes(net, w, h, thresh, .5, map, 0, &nboxes);
|
||||
if (nms) do_nms_sort(dets, nboxes, classes, nms);
|
||||
if (coco){
|
||||
if (coco) {
|
||||
print_cocos(fp, path, dets, nboxes, classes, w, h);
|
||||
} else if (imagenet){
|
||||
print_imagenet_detections(fp, i+t-nthreads+1, dets, nboxes, classes, w, h);
|
||||
} else if (imagenet) {
|
||||
print_imagenet_detections(fp, i + t - nthreads + 1, dets, nboxes, classes, w, h);
|
||||
} else {
|
||||
print_detector_detections(fps, id, dets, nboxes, classes, w, h);
|
||||
}
|
||||
|
@ -475,33 +476,32 @@ void validate_detector(char *datacfg, char *cfgfile, char *weightfile, char *out
|
|||
free_image(val_resized[t]);
|
||||
}
|
||||
}
|
||||
for(j = 0; j < classes; ++j){
|
||||
if(fps) fclose(fps[j]);
|
||||
for (j = 0; j < classes; ++j) {
|
||||
if (fps) fclose(fps[j]);
|
||||
}
|
||||
if(coco){
|
||||
fseek(fp, -2, SEEK_CUR);
|
||||
if (coco) {
|
||||
fseek(fp, -2, SEEK_CUR);
|
||||
fprintf(fp, "\n]\n");
|
||||
fclose(fp);
|
||||
}
|
||||
fprintf(stderr, "Total Detection Time: %f Seconds\n", what_time_is_it_now() - start);
|
||||
}
|
||||
|
||||
void validate_detector_recall(char *cfgfile, char *weightfile)
|
||||
{
|
||||
void validate_detector_recall(char *cfgfile, char *weightfile) {
|
||||
network *net = load_network(cfgfile, weightfile, 0);
|
||||
set_batch_network(net, 1);
|
||||
fprintf(stderr, "Learning Rate: %g, Momentum: %g, Decay: %g\n", net->learning_rate, net->momentum, net->decay);
|
||||
srand(time(0));
|
||||
|
||||
list *plist = get_paths("data/coco_val_5k.list");
|
||||
char **paths = (char **)list_to_array(plist);
|
||||
char **paths = (char **) list_to_array(plist);
|
||||
|
||||
layer l = net->layers[net->n-1];
|
||||
layer l = net->layers[net->n - 1];
|
||||
|
||||
int j, k;
|
||||
|
||||
int m = plist->size;
|
||||
int i=0;
|
||||
int i = 0;
|
||||
|
||||
float thresh = .001;
|
||||
float iou_thresh = .5;
|
||||
|
@ -512,7 +512,7 @@ void validate_detector_recall(char *cfgfile, char *weightfile)
|
|||
int proposals = 0;
|
||||
float avg_iou = 0;
|
||||
|
||||
for(i = 0; i < m; ++i){
|
||||
for (i = 0; i < m; ++i) {
|
||||
char *path = paths[i];
|
||||
image orig = load_image_color(path, 0, 0);
|
||||
image sized = resize_image(orig, net->w, net->h);
|
||||
|
@ -530,8 +530,8 @@ void validate_detector_recall(char *cfgfile, char *weightfile)
|
|||
|
||||
int num_labels = 0;
|
||||
box_label *truth = read_boxes(labelpath, &num_labels);
|
||||
for(k = 0; k < nboxes; ++k){
|
||||
if(dets[k].objectness > thresh){
|
||||
for (k = 0; k < nboxes; ++k) {
|
||||
if (dets[k].objectness > thresh) {
|
||||
++proposals;
|
||||
}
|
||||
}
|
||||
|
@ -539,19 +539,20 @@ void validate_detector_recall(char *cfgfile, char *weightfile)
|
|||
++total;
|
||||
box t = {truth[j].x, truth[j].y, truth[j].w, truth[j].h};
|
||||
float best_iou = 0;
|
||||
for(k = 0; k < l.w*l.h*l.n; ++k){
|
||||
for (k = 0; k < l.w * l.h * l.n; ++k) {
|
||||
float iou = box_iou(dets[k].bbox, t);
|
||||
if(dets[k].objectness > thresh && iou > best_iou){
|
||||
if (dets[k].objectness > thresh && iou > best_iou) {
|
||||
best_iou = iou;
|
||||
}
|
||||
}
|
||||
avg_iou += best_iou;
|
||||
if(best_iou > iou_thresh){
|
||||
if (best_iou > iou_thresh) {
|
||||
++correct;
|
||||
}
|
||||
}
|
||||
|
||||
fprintf(stderr, "%5d %5d %5d\tRPs/Img: %.2f\tIOU: %.2f%%\tRecall:%.2f%%\n", i, correct, total, (float)proposals/(i+1), avg_iou*100/total, 100.*correct/total);
|
||||
fprintf(stderr, "%5d %5d %5d\tRPs/Img: %.2f\tIOU: %.2f%%\tRecall:%.2f%%\n", i, correct, total,
|
||||
(float) proposals / (i + 1), avg_iou * 100 / total, 100. * correct / total);
|
||||
free(id);
|
||||
free_image(orig);
|
||||
free_image(sized);
|
||||
|
@ -559,8 +560,8 @@ void validate_detector_recall(char *cfgfile, char *weightfile)
|
|||
}
|
||||
|
||||
|
||||
void test_detector(char *datacfg, char *cfgfile, char *weightfile, char *filename, float thresh, float hier_thresh, char *outfile, int fullscreen)
|
||||
{
|
||||
void test_detector(char *datacfg, char *cfgfile, char *weightfile, char *filename, float thresh, float hier_thresh,
|
||||
char *outfile, int fullscreen) {
|
||||
list *options = read_data_cfg(datacfg);
|
||||
char *name_list = option_find_str(options, "names", "data/names.list");
|
||||
char **names = get_labels(name_list);
|
||||
|
@ -572,30 +573,30 @@ void test_detector(char *datacfg, char *cfgfile, char *weightfile, char *filenam
|
|||
double time;
|
||||
char buff[256];
|
||||
char *input = buff;
|
||||
float nms=.45;
|
||||
while(1){
|
||||
if(filename){
|
||||
float nms = .45;
|
||||
while (1) {
|
||||
if (filename) {
|
||||
strncpy(input, filename, 256);
|
||||
} else {
|
||||
printf("Enter Image Path: ");
|
||||
fflush(stdout);
|
||||
input = fgets(input, 256, stdin);
|
||||
if(!input) return;
|
||||
if (!input) return;
|
||||
strtok(input, "\n");
|
||||
}
|
||||
image im = load_image_color(input,0,0);
|
||||
image im = load_image_color(input, 0, 0);
|
||||
image sized = letterbox_image(im, net->w, net->h);
|
||||
//image sized = resize_image(im, net->w, net->h);
|
||||
//image sized2 = resize_max(im, net->w);
|
||||
//image sized = crop_image(sized2, -((net->w - sized2.w)/2), -((net->h - sized2.h)/2), net->w, net->h);
|
||||
//resize_network(net, sized.w, sized.h);
|
||||
layer l = net->layers[net->n-1];
|
||||
layer l = net->layers[net->n - 1];
|
||||
|
||||
|
||||
float *X = sized.data;
|
||||
time=what_time_is_it_now();
|
||||
time = what_time_is_it_now();
|
||||
network_predict(net, X);
|
||||
printf("%s: Predicted in %f seconds.\n", input, what_time_is_it_now()-time);
|
||||
printf("%s: Predicted in %f seconds.\n", input, what_time_is_it_now() - time);
|
||||
int nboxes = 0;
|
||||
detection *dets = get_network_boxes(net, im.w, im.h, thresh, hier_thresh, 0, 1, &nboxes);
|
||||
//printf("%d\n", nboxes);
|
||||
|
@ -603,10 +604,9 @@ void test_detector(char *datacfg, char *cfgfile, char *weightfile, char *filenam
|
|||
if (nms) do_nms_sort(dets, nboxes, l.classes, nms);
|
||||
draw_detections(im, dets, nboxes, thresh, names, alphabet, l.classes);
|
||||
free_detections(dets, nboxes);
|
||||
if(outfile){
|
||||
if (outfile) {
|
||||
save_image(im, outfile);
|
||||
}
|
||||
else{
|
||||
} else {
|
||||
save_image(im, "predictions");
|
||||
#ifdef OPENCV
|
||||
make_window("predictions", 512, 512, 0);
|
||||
|
@ -621,7 +621,7 @@ void test_detector(char *datacfg, char *cfgfile, char *weightfile, char *filenam
|
|||
}
|
||||
|
||||
/*
|
||||
void censor_detector(char *datacfg, char *cfgfile, char *weightfile, int cam_index, const char *filename, int class, float thresh, int skip)
|
||||
void censor_detector(char *datacfg, char *cfgfile, char *weightfile, int cam_index, const char *filename, int nclass, float thresh, int skip)
|
||||
{
|
||||
#ifdef OPENCV
|
||||
char *base = basecfg(cfgfile);
|
||||
|
@ -668,7 +668,7 @@ void censor_detector(char *datacfg, char *cfgfile, char *weightfile, int cam_ind
|
|||
if (nms) do_nms_sort(dets, nboxes, l.classes, nms);
|
||||
|
||||
for(i = 0; i < nboxes; ++i){
|
||||
if(dets[i].prob[class] > thresh){
|
||||
if(dets[i].prob[nclass] > thresh){
|
||||
box b = dets[i].bbox;
|
||||
int left = b.x-b.w/2.;
|
||||
int top = b.y-b.h/2.;
|
||||
|
@ -694,7 +694,7 @@ void censor_detector(char *datacfg, char *cfgfile, char *weightfile, int cam_ind
|
|||
#endif
|
||||
}
|
||||
|
||||
void extract_detector(char *datacfg, char *cfgfile, char *weightfile, int cam_index, const char *filename, int class, float thresh, int skip)
|
||||
void extract_detector(char *datacfg, char *cfgfile, char *weightfile, int cam_index, const char *filename, int nclass, float thresh, int skip)
|
||||
{
|
||||
#ifdef OPENCV
|
||||
char *base = basecfg(cfgfile);
|
||||
|
@ -744,7 +744,7 @@ void extract_detector(char *datacfg, char *cfgfile, char *weightfile, int cam_in
|
|||
if (nms) do_nms_sort(dets, nboxes, l.classes, nms);
|
||||
|
||||
for(i = 0; i < nboxes; ++i){
|
||||
if(dets[i].prob[class] > thresh){
|
||||
if(dets[i].prob[nclass] > thresh){
|
||||
box b = dets[i].bbox;
|
||||
int size = b.w*in.w > b.h*in.h ? b.w*in.w : b.h*in.h;
|
||||
int dx = b.x*in.w-size/2.;
|
||||
|
@ -786,15 +786,14 @@ void network_detect(network *net, image im, float thresh, float hier_thresh, flo
|
|||
}
|
||||
*/
|
||||
|
||||
void run_detector(int argc, char **argv)
|
||||
{
|
||||
void run_detector(int argc, char **argv) {
|
||||
char *prefix = find_char_arg(argc, argv, "-prefix", 0);
|
||||
float thresh = find_float_arg(argc, argv, "-thresh", .5);
|
||||
float hier_thresh = find_float_arg(argc, argv, "-hier", .5);
|
||||
int cam_index = find_int_arg(argc, argv, "-c", 0);
|
||||
int frame_skip = find_int_arg(argc, argv, "-s", 0);
|
||||
int avg = find_int_arg(argc, argv, "-avg", 3);
|
||||
if(argc < 4){
|
||||
if (argc < 4) {
|
||||
fprintf(stderr, "usage: %s %s [train/test/valid] [cfg] [weights (optional)]\n", argv[0], argv[1]);
|
||||
return;
|
||||
}
|
||||
|
@ -803,18 +802,18 @@ void run_detector(int argc, char **argv)
|
|||
int *gpus = 0;
|
||||
int gpu = 0;
|
||||
int ngpus = 0;
|
||||
if(gpu_list){
|
||||
if (gpu_list) {
|
||||
printf("%s\n", gpu_list);
|
||||
int len = strlen(gpu_list);
|
||||
ngpus = 1;
|
||||
int i;
|
||||
for(i = 0; i < len; ++i){
|
||||
for (i = 0; i < len; ++i) {
|
||||
if (gpu_list[i] == ',') ++ngpus;
|
||||
}
|
||||
gpus = calloc(ngpus, sizeof(int));
|
||||
for(i = 0; i < ngpus; ++i){
|
||||
gpus = (int *) calloc(ngpus, sizeof(int));
|
||||
for (i = 0; i < ngpus; ++i) {
|
||||
gpus[i] = atoi(gpu_list);
|
||||
gpu_list = strchr(gpu_list, ',')+1;
|
||||
gpu_list = strchr(gpu_list, ',') + 1;
|
||||
}
|
||||
} else {
|
||||
gpu = gpu_index;
|
||||
|
@ -827,24 +826,26 @@ void run_detector(int argc, char **argv)
|
|||
int width = find_int_arg(argc, argv, "-w", 0);
|
||||
int height = find_int_arg(argc, argv, "-h", 0);
|
||||
int fps = find_int_arg(argc, argv, "-fps", 0);
|
||||
//int class = find_int_arg(argc, argv, "-class", 0);
|
||||
//int nclass = find_int_arg(argc, argv, "-nclass", 0);
|
||||
|
||||
char *datacfg = argv[3];
|
||||
char *cfg = argv[4];
|
||||
char *weights = (argc > 5) ? argv[5] : 0;
|
||||
char *filename = (argc > 6) ? argv[6]: 0;
|
||||
if(0==strcmp(argv[2], "test")) test_detector(datacfg, cfg, weights, filename, thresh, hier_thresh, outfile, fullscreen);
|
||||
else if(0==strcmp(argv[2], "train")) train_detector(datacfg, cfg, weights, gpus, ngpus, clear);
|
||||
else if(0==strcmp(argv[2], "valid")) validate_detector(datacfg, cfg, weights, outfile);
|
||||
else if(0==strcmp(argv[2], "valid2")) validate_detector_flip(datacfg, cfg, weights, outfile);
|
||||
else if(0==strcmp(argv[2], "recall")) validate_detector_recall(cfg, weights);
|
||||
else if(0==strcmp(argv[2], "demo")) {
|
||||
char *filename = (argc > 6) ? argv[6] : 0;
|
||||
if (0 == strcmp(argv[2], "test"))
|
||||
test_detector(datacfg, cfg, weights, filename, thresh, hier_thresh, outfile, fullscreen);
|
||||
else if (0 == strcmp(argv[2], "train")) train_detector(datacfg, cfg, weights, gpus, ngpus, clear);
|
||||
else if (0 == strcmp(argv[2], "valid")) validate_detector(datacfg, cfg, weights, outfile);
|
||||
else if (0 == strcmp(argv[2], "valid2")) validate_detector_flip(datacfg, cfg, weights, outfile);
|
||||
else if (0 == strcmp(argv[2], "recall")) validate_detector_recall(cfg, weights);
|
||||
else if (0 == strcmp(argv[2], "demo")) {
|
||||
list *options = read_data_cfg(datacfg);
|
||||
int classes = option_find_int(options, "classes", 20);
|
||||
char *name_list = option_find_str(options, "names", "data/names.list");
|
||||
char **names = get_labels(name_list);
|
||||
demo(cfg, weights, thresh, cam_index, filename, names, classes, frame_skip, prefix, avg, hier_thresh, width, height, fps, fullscreen);
|
||||
demo(cfg, weights, thresh, cam_index, filename, names, classes, frame_skip, prefix, avg, hier_thresh, width,
|
||||
height, fps, fullscreen);
|
||||
}
|
||||
//else if(0==strcmp(argv[2], "extract")) extract_detector(datacfg, cfg, weights, cam_index, filename, class, thresh, frame_skip);
|
||||
//else if(0==strcmp(argv[2], "censor")) censor_detector(datacfg, cfg, weights, cam_index, filename, class, thresh, frame_skip);
|
||||
//else if(0==strcmp(argv[2], "extract")) extract_detector(datacfg, cfg, weights, cam_index, filename, nclass, thresh, frame_skip);
|
||||
//else if(0==strcmp(argv[2], "censor")) censor_detector(datacfg, cfg, weights, cam_index, filename, nclass, thresh, frame_skip);
|
||||
}
|
|
@ -1,27 +0,0 @@
|
|||
# Stupid python path shit.
|
||||
# Instead just add darknet.py to somewhere in your python path
|
||||
# OK actually that might not be a great idea, idk, work in progress
|
||||
# Use at your own risk. or don't, i don't care
|
||||
|
||||
import sys, os
|
||||
sys.path.append(os.path.join(os.getcwd(),'python/'))
|
||||
|
||||
import darknet as dn
|
||||
import pdb
|
||||
|
||||
dn.set_gpu(0)
|
||||
net = dn.load_net("cfg/yolo-thor.cfg", "/home/pjreddie/backup/yolo-thor_final.weights", 0)
|
||||
meta = dn.load_meta("cfg/thor.data")
|
||||
r = dn.detect(net, meta, "data/bedroom.jpg")
|
||||
print r
|
||||
|
||||
# And then down here you could detect a lot more images like:
|
||||
r = dn.detect(net, meta, "data/eagle.jpg")
|
||||
print r
|
||||
r = dn.detect(net, meta, "data/giraffe.jpg")
|
||||
print r
|
||||
r = dn.detect(net, meta, "data/horses.jpg")
|
||||
print r
|
||||
r = dn.detect(net, meta, "data/person.jpg")
|
||||
print r
|
||||
|
File diff suppressed because it is too large
Load Diff
|
@ -3,19 +3,19 @@
|
|||
#include <assert.h>
|
||||
|
||||
void normalize_image2(image p);
|
||||
void train_isegmenter(char *datacfg, char *cfgfile, char *weightfile, int *gpus, int ngpus, int clear, int display)
|
||||
{
|
||||
|
||||
void train_isegmenter(char *datacfg, char *cfgfile, char *weightfile, int *gpus, int ngpus, int clear, int display) {
|
||||
int i;
|
||||
|
||||
float avg_loss = -1;
|
||||
char *base = basecfg(cfgfile);
|
||||
printf("%s\n", base);
|
||||
printf("%d\n", ngpus);
|
||||
network **nets = calloc(ngpus, sizeof(network*));
|
||||
network **nets = (network **) calloc(ngpus, sizeof(network * ));
|
||||
|
||||
srand(time(0));
|
||||
int seed = rand();
|
||||
for(i = 0; i < ngpus; ++i){
|
||||
for (i = 0; i < ngpus; ++i) {
|
||||
srand(seed);
|
||||
#ifdef GPU
|
||||
cuda_set_device(gpus[i]);
|
||||
|
@ -29,9 +29,9 @@ void train_isegmenter(char *datacfg, char *cfgfile, char *weightfile, int *gpus,
|
|||
|
||||
image embed = pred;
|
||||
embed.c = 3;
|
||||
embed.data += embed.w*embed.h*80;
|
||||
embed.data += embed.w * embed.h * 80;
|
||||
|
||||
int div = net->w/pred.w;
|
||||
int div = net->w / pred.w;
|
||||
assert(pred.w * div == net->w);
|
||||
assert(pred.h * div == net->h);
|
||||
|
||||
|
@ -44,7 +44,7 @@ void train_isegmenter(char *datacfg, char *cfgfile, char *weightfile, int *gpus,
|
|||
char *train_list = option_find_str(options, "train", "data/train.list");
|
||||
|
||||
list *plist = get_paths(train_list);
|
||||
char **paths = (char **)list_to_array(plist);
|
||||
char **paths = (char **) list_to_array(plist);
|
||||
printf("%d\n", plist->size);
|
||||
int N = plist->size;
|
||||
|
||||
|
@ -76,15 +76,15 @@ void train_isegmenter(char *datacfg, char *cfgfile, char *weightfile, int *gpus,
|
|||
args.d = &buffer;
|
||||
load_thread = load_data(args);
|
||||
|
||||
int epoch = (*net->seen)/N;
|
||||
while(get_current_batch(net) < net->max_batches || net->max_batches == 0){
|
||||
int epoch = (*net->seen) / N;
|
||||
while (get_current_batch(net) < net->max_batches || net->max_batches == 0) {
|
||||
double time = what_time_is_it_now();
|
||||
|
||||
pthread_join(load_thread, 0);
|
||||
train = buffer;
|
||||
load_thread = load_data(args);
|
||||
|
||||
printf("Loaded: %lf seconds\n", what_time_is_it_now()-time);
|
||||
printf("Loaded: %lf seconds\n", what_time_is_it_now() - time);
|
||||
time = what_time_is_it_now();
|
||||
|
||||
float loss = 0;
|
||||
|
@ -97,9 +97,10 @@ void train_isegmenter(char *datacfg, char *cfgfile, char *weightfile, int *gpus,
|
|||
#else
|
||||
loss = train_network(net, train);
|
||||
#endif
|
||||
if(display){
|
||||
image tr = float_to_image(net->w/div, net->h/div, 80, train.y.vals[net->batch*(net->subdivisions-1)]);
|
||||
image im = float_to_image(net->w, net->h, net->c, train.X.vals[net->batch*(net->subdivisions-1)]);
|
||||
if (display) {
|
||||
image tr = float_to_image(net->w / div, net->h / div, 80,
|
||||
train.y.vals[net->batch * (net->subdivisions - 1)]);
|
||||
image im = float_to_image(net->w, net->h, net->c, train.X.vals[net->batch * (net->subdivisions - 1)]);
|
||||
pred.c = 80;
|
||||
image mask = mask_to_rgb(tr);
|
||||
image prmask = mask_to_rgb(pred);
|
||||
|
@ -114,19 +115,21 @@ void train_isegmenter(char *datacfg, char *cfgfile, char *weightfile, int *gpus,
|
|||
free_image(mask);
|
||||
free_image(prmask);
|
||||
}
|
||||
if(avg_loss == -1) avg_loss = loss;
|
||||
avg_loss = avg_loss*.9 + loss*.1;
|
||||
printf("%ld, %.3f: %f, %f avg, %f rate, %lf seconds, %ld images\n", get_current_batch(net), (float)(*net->seen)/N, loss, avg_loss, get_current_rate(net), what_time_is_it_now()-time, *net->seen);
|
||||
if (avg_loss == -1) avg_loss = loss;
|
||||
avg_loss = avg_loss * .9 + loss * .1;
|
||||
printf("%ld, %.3f: %f, %f avg, %f rate, %lf seconds, %ld images\n", get_current_batch(net),
|
||||
(float) (*net->seen) / N, loss, avg_loss, get_current_rate(net), what_time_is_it_now() - time,
|
||||
*net->seen);
|
||||
free_data(train);
|
||||
if(*net->seen/N > epoch){
|
||||
epoch = *net->seen/N;
|
||||
if (*net->seen / N > epoch) {
|
||||
epoch = *net->seen / N;
|
||||
char buff[256];
|
||||
sprintf(buff, "%s/%s_%d.weights",backup_directory,base, epoch);
|
||||
sprintf(buff, "%s/%s_%d.weights", backup_directory, base, epoch);
|
||||
save_weights(net, buff);
|
||||
}
|
||||
if(get_current_batch(net)%100 == 0){
|
||||
if (get_current_batch(net) % 100 == 0) {
|
||||
char buff[256];
|
||||
sprintf(buff, "%s/%s.backup",backup_directory,base);
|
||||
sprintf(buff, "%s/%s.backup", backup_directory, base);
|
||||
save_weights(net, buff);
|
||||
}
|
||||
}
|
||||
|
@ -135,13 +138,12 @@ void train_isegmenter(char *datacfg, char *cfgfile, char *weightfile, int *gpus,
|
|||
save_weights(net, buff);
|
||||
|
||||
free_network(net);
|
||||
free_ptrs((void**)paths, plist->size);
|
||||
free_ptrs((void **) paths, plist->size);
|
||||
free_list(plist);
|
||||
free(base);
|
||||
}
|
||||
|
||||
void predict_isegmenter(char *datafile, char *cfg, char *weights, char *filename)
|
||||
{
|
||||
void predict_isegmenter(char *datafile, char *cfg, char *weights, char *filename) {
|
||||
network *net = load_network(cfg, weights, 0);
|
||||
set_batch_network(net, 1);
|
||||
srand(2222222);
|
||||
|
@ -149,26 +151,26 @@ void predict_isegmenter(char *datafile, char *cfg, char *weights, char *filename
|
|||
clock_t time;
|
||||
char buff[256];
|
||||
char *input = buff;
|
||||
while(1){
|
||||
if(filename){
|
||||
while (1) {
|
||||
if (filename) {
|
||||
strncpy(input, filename, 256);
|
||||
}else{
|
||||
} else {
|
||||
printf("Enter Image Path: ");
|
||||
fflush(stdout);
|
||||
input = fgets(input, 256, stdin);
|
||||
if(!input) return;
|
||||
if (!input) return;
|
||||
strtok(input, "\n");
|
||||
}
|
||||
image im = load_image_color(input, 0, 0);
|
||||
image sized = letterbox_image(im, net->w, net->h);
|
||||
|
||||
float *X = sized.data;
|
||||
time=clock();
|
||||
time = clock();
|
||||
float *predictions = network_predict(net, X);
|
||||
image pred = get_network_image(net);
|
||||
image prmask = mask_to_rgb(pred);
|
||||
printf("Predicted: %f\n", predictions[0]);
|
||||
printf("%s: Predicted in %f seconds.\n", input, sec(clock()-time));
|
||||
printf("%s: Predicted in %f seconds.\n", input, sec(clock() - time));
|
||||
show_image(sized, "orig", 1);
|
||||
show_image(prmask, "pred", 0);
|
||||
free_image(im);
|
||||
|
@ -179,8 +181,7 @@ void predict_isegmenter(char *datafile, char *cfg, char *weights, char *filename
|
|||
}
|
||||
|
||||
|
||||
void demo_isegmenter(char *datacfg, char *cfg, char *weights, int cam_index, const char *filename)
|
||||
{
|
||||
void demo_isegmenter(char *datacfg, char *cfg, char *weights, int cam_index, const char *filename) {
|
||||
#ifdef OPENCV
|
||||
printf("Classifier Demo\n");
|
||||
network *net = load_network(cfg, weights, 0);
|
||||
|
@ -222,9 +223,8 @@ void demo_isegmenter(char *datacfg, char *cfg, char *weights, int cam_index, con
|
|||
}
|
||||
|
||||
|
||||
void run_isegmenter(int argc, char **argv)
|
||||
{
|
||||
if(argc < 4){
|
||||
void run_isegmenter(int argc, char **argv) {
|
||||
if (argc < 4) {
|
||||
fprintf(stderr, "usage: %s %s [train/test/valid] [cfg] [weights (optional)]\n", argv[0], argv[1]);
|
||||
return;
|
||||
}
|
||||
|
@ -233,18 +233,18 @@ void run_isegmenter(int argc, char **argv)
|
|||
int *gpus = 0;
|
||||
int gpu = 0;
|
||||
int ngpus = 0;
|
||||
if(gpu_list){
|
||||
if (gpu_list) {
|
||||
printf("%s\n", gpu_list);
|
||||
int len = strlen(gpu_list);
|
||||
ngpus = 1;
|
||||
int i;
|
||||
for(i = 0; i < len; ++i){
|
||||
for (i = 0; i < len; ++i) {
|
||||
if (gpu_list[i] == ',') ++ngpus;
|
||||
}
|
||||
gpus = calloc(ngpus, sizeof(int));
|
||||
for(i = 0; i < ngpus; ++i){
|
||||
gpus = (int *) calloc(ngpus, sizeof(int));
|
||||
for (i = 0; i < ngpus; ++i) {
|
||||
gpus[i] = atoi(gpu_list);
|
||||
gpu_list = strchr(gpu_list, ',')+1;
|
||||
gpu_list = strchr(gpu_list, ',') + 1;
|
||||
}
|
||||
} else {
|
||||
gpu = gpu_index;
|
||||
|
@ -258,10 +258,10 @@ void run_isegmenter(int argc, char **argv)
|
|||
char *data = argv[3];
|
||||
char *cfg = argv[4];
|
||||
char *weights = (argc > 5) ? argv[5] : 0;
|
||||
char *filename = (argc > 6) ? argv[6]: 0;
|
||||
if(0==strcmp(argv[2], "test")) predict_isegmenter(data, cfg, weights, filename);
|
||||
else if(0==strcmp(argv[2], "train")) train_isegmenter(data, cfg, weights, gpus, ngpus, clear, display);
|
||||
else if(0==strcmp(argv[2], "demo")) demo_isegmenter(data, cfg, weights, cam_index, filename);
|
||||
char *filename = (argc > 6) ? argv[6] : 0;
|
||||
if (0 == strcmp(argv[2], "test")) predict_isegmenter(data, cfg, weights, filename);
|
||||
else if (0 == strcmp(argv[2], "train")) train_isegmenter(data, cfg, weights, gpus, ngpus, clear, display);
|
||||
else if (0 == strcmp(argv[2], "demo")) demo_isegmenter(data, cfg, weights, cam_index, filename);
|
||||
}
|
||||
|
||||
|
|
@ -682,7 +682,7 @@ void train_dcgan(char *cfg, char *weight, char *acfg, char *aweight, int clear,
|
|||
//float orig_rate = anet->learning_rate;
|
||||
|
||||
int i, j, k;
|
||||
layer imlayer = {0};
|
||||
layer imlayer = {(LAYER_TYPE)0};
|
||||
for (i = 0; i < gnet->n; ++i) {
|
||||
if (gnet->layers[i].out_c == 3) {
|
||||
imlayer = gnet->layers[i];
|
||||
|
@ -878,7 +878,7 @@ void train_colorizer(char *cfg, char *weight, char *acfg, char *aweight, int cle
|
|||
network *anet = load_network(acfg, aweight, clear);
|
||||
|
||||
int i, j, k;
|
||||
layer imlayer = {0};
|
||||
layer imlayer = {(LAYER_TYPE)0};
|
||||
for (i = 0; i < net->n; ++i) {
|
||||
if (net->layers[i].out_c == 3) {
|
||||
imlayer = net->layers[i];
|
||||
|
@ -914,8 +914,8 @@ void train_colorizer(char *cfg, char *weight, char *acfg, char *aweight, int cle
|
|||
//int y_size = x_size;
|
||||
net->delta = 0;
|
||||
net->train = 1;
|
||||
float *pixs = calloc(x_size, sizeof(float));
|
||||
float *graypixs = calloc(x_size, sizeof(float));
|
||||
float *pixs = (float *) calloc(x_size, sizeof(float));
|
||||
float *graypixs = (float *) calloc(x_size, sizeof(float));
|
||||
//float *y = calloc(y_size, sizeof(float));
|
||||
|
||||
//int ay_size = anet->outputs*anet->batch;
|
|
@ -2,19 +2,18 @@
|
|||
#include <sys/time.h>
|
||||
#include <assert.h>
|
||||
|
||||
void train_regressor(char *datacfg, char *cfgfile, char *weightfile, int *gpus, int ngpus, int clear)
|
||||
{
|
||||
void train_regressor(char *datacfg, char *cfgfile, char *weightfile, int *gpus, int ngpus, int clear) {
|
||||
int i;
|
||||
|
||||
float avg_loss = -1;
|
||||
char *base = basecfg(cfgfile);
|
||||
printf("%s\n", base);
|
||||
printf("%d\n", ngpus);
|
||||
network **nets = calloc(ngpus, sizeof(network*));
|
||||
network **nets = (network **) calloc(ngpus, sizeof(network * ));
|
||||
|
||||
srand(time(0));
|
||||
int seed = rand();
|
||||
for(i = 0; i < ngpus; ++i){
|
||||
for (i = 0; i < ngpus; ++i) {
|
||||
srand(seed);
|
||||
#ifdef GPU
|
||||
cuda_set_device(gpus[i]);
|
||||
|
@ -35,7 +34,7 @@ void train_regressor(char *datacfg, char *cfgfile, char *weightfile, int *gpus,
|
|||
int classes = option_find_int(options, "classes", 1);
|
||||
|
||||
list *plist = get_paths(train_list);
|
||||
char **paths = (char **)list_to_array(plist);
|
||||
char **paths = (char **) list_to_array(plist);
|
||||
printf("%d\n", plist->size);
|
||||
int N = plist->size;
|
||||
clock_t time;
|
||||
|
@ -46,8 +45,8 @@ void train_regressor(char *datacfg, char *cfgfile, char *weightfile, int *gpus,
|
|||
args.threads = 32;
|
||||
args.classes = classes;
|
||||
|
||||
args.min = net->min_ratio*net->w;
|
||||
args.max = net->max_ratio*net->w;
|
||||
args.min = net->min_ratio * net->w;
|
||||
args.max = net->max_ratio * net->w;
|
||||
args.angle = net->angle;
|
||||
args.aspect = net->aspect;
|
||||
args.exposure = net->exposure;
|
||||
|
@ -66,16 +65,16 @@ void train_regressor(char *datacfg, char *cfgfile, char *weightfile, int *gpus,
|
|||
args.d = &buffer;
|
||||
load_thread = load_data(args);
|
||||
|
||||
int epoch = (*net->seen)/N;
|
||||
while(get_current_batch(net) < net->max_batches || net->max_batches == 0){
|
||||
time=clock();
|
||||
int epoch = (*net->seen) / N;
|
||||
while (get_current_batch(net) < net->max_batches || net->max_batches == 0) {
|
||||
time = clock();
|
||||
|
||||
pthread_join(load_thread, 0);
|
||||
train = buffer;
|
||||
load_thread = load_data(args);
|
||||
|
||||
printf("Loaded: %lf seconds\n", sec(clock()-time));
|
||||
time=clock();
|
||||
printf("Loaded: %lf seconds\n", sec(clock() - time));
|
||||
time = clock();
|
||||
|
||||
float loss = 0;
|
||||
#ifdef GPU
|
||||
|
@ -87,19 +86,20 @@ void train_regressor(char *datacfg, char *cfgfile, char *weightfile, int *gpus,
|
|||
#else
|
||||
loss = train_network(net, train);
|
||||
#endif
|
||||
if(avg_loss == -1) avg_loss = loss;
|
||||
avg_loss = avg_loss*.9 + loss*.1;
|
||||
printf("%ld, %.3f: %f, %f avg, %f rate, %lf seconds, %ld images\n", get_current_batch(net), (float)(*net->seen)/N, loss, avg_loss, get_current_rate(net), sec(clock()-time), *net->seen);
|
||||
if (avg_loss == -1) avg_loss = loss;
|
||||
avg_loss = avg_loss * .9 + loss * .1;
|
||||
printf("%ld, %.3f: %f, %f avg, %f rate, %lf seconds, %ld images\n", get_current_batch(net),
|
||||
(float) (*net->seen) / N, loss, avg_loss, get_current_rate(net), sec(clock() - time), *net->seen);
|
||||
free_data(train);
|
||||
if(*net->seen/N > epoch){
|
||||
epoch = *net->seen/N;
|
||||
if (*net->seen / N > epoch) {
|
||||
epoch = *net->seen / N;
|
||||
char buff[256];
|
||||
sprintf(buff, "%s/%s_%d.weights",backup_directory,base, epoch);
|
||||
sprintf(buff, "%s/%s_%d.weights", backup_directory, base, epoch);
|
||||
save_weights(net, buff);
|
||||
}
|
||||
if(get_current_batch(net)%100 == 0){
|
||||
if (get_current_batch(net) % 100 == 0) {
|
||||
char buff[256];
|
||||
sprintf(buff, "%s/%s.backup",backup_directory,base);
|
||||
sprintf(buff, "%s/%s.backup", backup_directory, base);
|
||||
save_weights(net, buff);
|
||||
}
|
||||
}
|
||||
|
@ -108,13 +108,12 @@ void train_regressor(char *datacfg, char *cfgfile, char *weightfile, int *gpus,
|
|||
save_weights(net, buff);
|
||||
|
||||
free_network(net);
|
||||
free_ptrs((void**)paths, plist->size);
|
||||
free_ptrs((void **) paths, plist->size);
|
||||
free_list(plist);
|
||||
free(base);
|
||||
}
|
||||
|
||||
void predict_regressor(char *cfgfile, char *weightfile, char *filename)
|
||||
{
|
||||
void predict_regressor(char *cfgfile, char *weightfile, char *filename) {
|
||||
network *net = load_network(cfgfile, weightfile, 0);
|
||||
set_batch_network(net, 1);
|
||||
srand(2222222);
|
||||
|
@ -122,24 +121,24 @@ void predict_regressor(char *cfgfile, char *weightfile, char *filename)
|
|||
clock_t time;
|
||||
char buff[256];
|
||||
char *input = buff;
|
||||
while(1){
|
||||
if(filename){
|
||||
while (1) {
|
||||
if (filename) {
|
||||
strncpy(input, filename, 256);
|
||||
}else{
|
||||
} else {
|
||||
printf("Enter Image Path: ");
|
||||
fflush(stdout);
|
||||
input = fgets(input, 256, stdin);
|
||||
if(!input) return;
|
||||
if (!input) return;
|
||||
strtok(input, "\n");
|
||||
}
|
||||
image im = load_image_color(input, 0, 0);
|
||||
image sized = letterbox_image(im, net->w, net->h);
|
||||
|
||||
float *X = sized.data;
|
||||
time=clock();
|
||||
time = clock();
|
||||
float *predictions = network_predict(net, X);
|
||||
printf("Predicted: %f\n", predictions[0]);
|
||||
printf("%s: Predicted in %f seconds.\n", input, sec(clock()-time));
|
||||
printf("%s: Predicted in %f seconds.\n", input, sec(clock() - time));
|
||||
free_image(im);
|
||||
free_image(sized);
|
||||
if (filename) break;
|
||||
|
@ -147,8 +146,7 @@ void predict_regressor(char *cfgfile, char *weightfile, char *filename)
|
|||
}
|
||||
|
||||
|
||||
void demo_regressor(char *datacfg, char *cfgfile, char *weightfile, int cam_index, const char *filename)
|
||||
{
|
||||
void demo_regressor(char *datacfg, char *cfgfile, char *weightfile, int cam_index, const char *filename) {
|
||||
#ifdef OPENCV
|
||||
printf("Regressor Demo\n");
|
||||
network *net = load_network(cfgfile, weightfile, 0);
|
||||
|
@ -196,9 +194,8 @@ void demo_regressor(char *datacfg, char *cfgfile, char *weightfile, int cam_inde
|
|||
}
|
||||
|
||||
|
||||
void run_regressor(int argc, char **argv)
|
||||
{
|
||||
if(argc < 4){
|
||||
void run_regressor(int argc, char **argv) {
|
||||
if (argc < 4) {
|
||||
fprintf(stderr, "usage: %s %s [train/test/valid] [cfg] [weights (optional)]\n", argv[0], argv[1]);
|
||||
return;
|
||||
}
|
||||
|
@ -207,18 +204,18 @@ void run_regressor(int argc, char **argv)
|
|||
int *gpus = 0;
|
||||
int gpu = 0;
|
||||
int ngpus = 0;
|
||||
if(gpu_list){
|
||||
if (gpu_list) {
|
||||
printf("%s\n", gpu_list);
|
||||
int len = strlen(gpu_list);
|
||||
ngpus = 1;
|
||||
int i;
|
||||
for(i = 0; i < len; ++i){
|
||||
for (i = 0; i < len; ++i) {
|
||||
if (gpu_list[i] == ',') ++ngpus;
|
||||
}
|
||||
gpus = calloc(ngpus, sizeof(int));
|
||||
for(i = 0; i < ngpus; ++i){
|
||||
gpus = (int *) calloc(ngpus, sizeof(int));
|
||||
for (i = 0; i < ngpus; ++i) {
|
||||
gpus[i] = atoi(gpu_list);
|
||||
gpu_list = strchr(gpu_list, ',')+1;
|
||||
gpu_list = strchr(gpu_list, ',') + 1;
|
||||
}
|
||||
} else {
|
||||
gpu = gpu_index;
|
||||
|
@ -231,10 +228,10 @@ void run_regressor(int argc, char **argv)
|
|||
char *data = argv[3];
|
||||
char *cfg = argv[4];
|
||||
char *weights = (argc > 5) ? argv[5] : 0;
|
||||
char *filename = (argc > 6) ? argv[6]: 0;
|
||||
if(0==strcmp(argv[2], "test")) predict_regressor(data, cfg, weights);
|
||||
else if(0==strcmp(argv[2], "train")) train_regressor(data, cfg, weights, gpus, ngpus, clear);
|
||||
else if(0==strcmp(argv[2], "demo")) demo_regressor(data, cfg, weights, cam_index, filename);
|
||||
char *filename = (argc > 6) ? argv[6] : 0;
|
||||
if (0 == strcmp(argv[2], "test")) predict_regressor(data, cfg, weights);
|
||||
else if (0 == strcmp(argv[2], "train")) train_regressor(data, cfg, weights, gpus, ngpus, clear);
|
||||
else if (0 == strcmp(argv[2], "demo")) demo_regressor(data, cfg, weights, cam_index, filename);
|
||||
}
|
||||
|
||||
|
|
@ -11,7 +11,7 @@ unsigned char **load_files(char *filename, int *n)
|
|||
{
|
||||
list *paths = get_paths(filename);
|
||||
*n = paths->size;
|
||||
unsigned char **contents = calloc(*n, sizeof(char *));
|
||||
unsigned char **contents = (unsigned char **)calloc(*n, sizeof(char *));
|
||||
int i;
|
||||
node *x = paths->front;
|
||||
for(i = 0; i < *n; ++i){
|
||||
|
@ -26,20 +26,20 @@ int *read_tokenized_data(char *filename, size_t *read)
|
|||
size_t size = 512;
|
||||
size_t count = 0;
|
||||
FILE *fp = fopen(filename, "r");
|
||||
int *d = calloc(size, sizeof(int));
|
||||
int *d = (int *)calloc(size, sizeof(int));
|
||||
int n, one;
|
||||
one = fscanf(fp, "%d", &n);
|
||||
while(one == 1){
|
||||
++count;
|
||||
if(count > size){
|
||||
size = size*2;
|
||||
d = realloc(d, size*sizeof(int));
|
||||
d = (int *) realloc(d, size*sizeof(int));
|
||||
}
|
||||
d[count-1] = n;
|
||||
one = fscanf(fp, "%d", &n);
|
||||
}
|
||||
fclose(fp);
|
||||
d = realloc(d, count*sizeof(int));
|
||||
d = (int *) realloc(d, count*sizeof(int));
|
||||
*read = count;
|
||||
return d;
|
||||
}
|
||||
|
@ -49,19 +49,19 @@ char **read_tokens(char *filename, size_t *read)
|
|||
size_t size = 512;
|
||||
size_t count = 0;
|
||||
FILE *fp = fopen(filename, "r");
|
||||
char **d = calloc(size, sizeof(char *));
|
||||
char **d = (char **)calloc(size, sizeof(char *));
|
||||
char *line;
|
||||
while((line=fgetl(fp)) != 0){
|
||||
++count;
|
||||
if(count > size){
|
||||
size = size*2;
|
||||
d = realloc(d, size*sizeof(char *));
|
||||
d = (char **) realloc(d, size*sizeof(char *));
|
||||
}
|
||||
if(0==strcmp(line, "<NEWLINE>")) line = "\n";
|
||||
d[count-1] = line;
|
||||
}
|
||||
fclose(fp);
|
||||
d = realloc(d, count*sizeof(char *));
|
||||
d = (char **) realloc(d, count*sizeof(char *));
|
||||
*read = count;
|
||||
return d;
|
||||
}
|
||||
|
@ -69,8 +69,8 @@ char **read_tokens(char *filename, size_t *read)
|
|||
|
||||
float_pair get_rnn_token_data(int *tokens, size_t *offsets, int characters, size_t len, int batch, int steps)
|
||||
{
|
||||
float *x = calloc(batch * steps * characters, sizeof(float));
|
||||
float *y = calloc(batch * steps * characters, sizeof(float));
|
||||
float *x = (float*) calloc(batch * steps * characters, sizeof(float));
|
||||
float *y = (float*) calloc(batch * steps * characters, sizeof(float));
|
||||
int i,j;
|
||||
for(i = 0; i < batch; ++i){
|
||||
for(j = 0; j < steps; ++j){
|
||||
|
@ -96,8 +96,8 @@ float_pair get_rnn_token_data(int *tokens, size_t *offsets, int characters, size
|
|||
float_pair get_seq2seq_data(char **source, char **dest, int n, int characters, size_t len, int batch, int steps)
|
||||
{
|
||||
int i,j;
|
||||
float *x = calloc(batch * steps * characters, sizeof(float));
|
||||
float *y = calloc(batch * steps * characters, sizeof(float));
|
||||
float *x = (float*) calloc(batch * steps * characters, sizeof(float));
|
||||
float *y = (float*) calloc(batch * steps * characters, sizeof(float));
|
||||
for(i = 0; i < batch; ++i){
|
||||
int index = rand()%n;
|
||||
//int slen = strlen(source[index]);
|
||||
|
@ -126,8 +126,8 @@ float_pair get_seq2seq_data(char **source, char **dest, int n, int characters, s
|
|||
|
||||
float_pair get_rnn_data(unsigned char *text, size_t *offsets, int characters, size_t len, int batch, int steps)
|
||||
{
|
||||
float *x = calloc(batch * steps * characters, sizeof(float));
|
||||
float *y = calloc(batch * steps * characters, sizeof(float));
|
||||
float *x = (float*) calloc(batch * steps * characters, sizeof(float));
|
||||
float *y = (float*) calloc(batch * steps * characters, sizeof(float));
|
||||
int i,j;
|
||||
for(i = 0; i < batch; ++i){
|
||||
for(j = 0; j < steps; ++j){
|
||||
|
@ -181,7 +181,7 @@ void train_char_rnn(char *cfgfile, char *weightfile, char *filename, int clear,
|
|||
int i = (*net->seen)/net->batch;
|
||||
|
||||
int streams = batch/steps;
|
||||
size_t *offsets = calloc(streams, sizeof(size_t));
|
||||
size_t *offsets = (size_t *)calloc(streams, sizeof(size_t));
|
||||
int j;
|
||||
for(j = 0; j < streams; ++j){
|
||||
offsets[j] = rand_size_t()%size;
|
||||
|
@ -261,7 +261,7 @@ void test_char_rnn(char *cfgfile, char *weightfile, int num, char *seed, float t
|
|||
for(i = 0; i < net->n; ++i) net->layers[i].temperature = temp;
|
||||
int c = 0;
|
||||
int len = strlen(seed);
|
||||
float *input = calloc(inputs, sizeof(float));
|
||||
float *input = (float*) calloc(inputs, sizeof(float));
|
||||
|
||||
/*
|
||||
fill_cpu(inputs, 0, input, 1);
|
||||
|
@ -314,7 +314,7 @@ void test_tactic_rnn_multi(char *cfgfile, char *weightfile, int num, float temp,
|
|||
int i, j;
|
||||
for(i = 0; i < net->n; ++i) net->layers[i].temperature = temp;
|
||||
int c = 0;
|
||||
float *input = calloc(inputs, sizeof(float));
|
||||
float *input = (float*) calloc(inputs, sizeof(float));
|
||||
float *out = 0;
|
||||
|
||||
while(1){
|
||||
|
@ -359,7 +359,7 @@ void test_tactic_rnn(char *cfgfile, char *weightfile, int num, float temp, int r
|
|||
int i, j;
|
||||
for(i = 0; i < net->n; ++i) net->layers[i].temperature = temp;
|
||||
int c = 0;
|
||||
float *input = calloc(inputs, sizeof(float));
|
||||
float *input = (float*) calloc(inputs, sizeof(float));
|
||||
float *out = 0;
|
||||
|
||||
while((c = getc(stdin)) != EOF){
|
||||
|
@ -395,7 +395,7 @@ void valid_tactic_rnn(char *cfgfile, char *weightfile, char *seed)
|
|||
int words = 1;
|
||||
int c;
|
||||
int len = strlen(seed);
|
||||
float *input = calloc(inputs, sizeof(float));
|
||||
float *input = (float*) calloc(inputs, sizeof(float));
|
||||
int i;
|
||||
for(i = 0; i < len; ++i){
|
||||
c = seed[i];
|
||||
|
@ -444,7 +444,7 @@ void valid_char_rnn(char *cfgfile, char *weightfile, char *seed)
|
|||
int words = 1;
|
||||
int c;
|
||||
int len = strlen(seed);
|
||||
float *input = calloc(inputs, sizeof(float));
|
||||
float *input = (float*) calloc(inputs, sizeof(float));
|
||||
int i;
|
||||
for(i = 0; i < len; ++i){
|
||||
c = seed[i];
|
||||
|
@ -480,7 +480,7 @@ void vec_char_rnn(char *cfgfile, char *weightfile, char *seed)
|
|||
|
||||
int c;
|
||||
int seed_len = strlen(seed);
|
||||
float *input = calloc(inputs, sizeof(float));
|
||||
float *input = (float*) calloc(inputs, sizeof(float));
|
||||
int i;
|
||||
char *line;
|
||||
while((line=fgetl(stdin)) != 0){
|
|
@ -19,10 +19,10 @@ float_pair get_rnn_vid_data(network net, char **files, int n, int batch, int ste
|
|||
image out_im = get_network_image(net);
|
||||
int output_size = out_im.w*out_im.h*out_im.c;
|
||||
printf("%d %d %d\n", out_im.w, out_im.h, out_im.c);
|
||||
float *feats = calloc(net.batch*batch*output_size, sizeof(float));
|
||||
float *feats = (float*) calloc(net.batch*batch*output_size, sizeof(float));
|
||||
for(b = 0; b < batch; ++b){
|
||||
int input_size = net.w*net.h*net.c;
|
||||
float *input = calloc(input_size*net.batch, sizeof(float));
|
||||
float *input = (float*) calloc(input_size*net.batch, sizeof(float));
|
||||
char *filename = files[rand()%n];
|
||||
CvCapture *cap = cvCaptureFromFile(filename);
|
||||
int frames = cvGetCaptureProperty(cap, CV_CAP_PROP_FRAME_COUNT);
|
||||
|
@ -183,9 +183,9 @@ void generate_vid_rnn(char *cfgfile, char *weightfile)
|
|||
}
|
||||
for(i = 0; i < 30; ++i){
|
||||
next = network_predict(net, next);
|
||||
image new = save_reconstruction(extractor, &last, next, "new", i);
|
||||
image new_image = save_reconstruction(extractor, &last, next, "new_image", i);
|
||||
free_image(last);
|
||||
last = new;
|
||||
last = new_image;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -203,6 +203,8 @@ void run_vid_rnn(int argc, char **argv)
|
|||
else if(0==strcmp(argv[2], "generate")) generate_vid_rnn(cfg, weights);
|
||||
}
|
||||
#else
|
||||
void run_vid_rnn(int argc, char **argv){}
|
||||
|
||||
void run_vid_rnn(int argc, char **argv) {}
|
||||
|
||||
#endif
|
||||
|
|
@ -2,19 +2,18 @@
|
|||
#include <sys/time.h>
|
||||
#include <assert.h>
|
||||
|
||||
void train_segmenter(char *datacfg, char *cfgfile, char *weightfile, int *gpus, int ngpus, int clear, int display)
|
||||
{
|
||||
void train_segmenter(char *datacfg, char *cfgfile, char *weightfile, int *gpus, int ngpus, int clear, int display) {
|
||||
int i;
|
||||
|
||||
float avg_loss = -1;
|
||||
char *base = basecfg(cfgfile);
|
||||
printf("%s\n", base);
|
||||
printf("%d\n", ngpus);
|
||||
network **nets = calloc(ngpus, sizeof(network*));
|
||||
network **nets = (network **) calloc(ngpus, sizeof(network * ));
|
||||
|
||||
srand(time(0));
|
||||
int seed = rand();
|
||||
for(i = 0; i < ngpus; ++i){
|
||||
for (i = 0; i < ngpus; ++i) {
|
||||
srand(seed);
|
||||
#ifdef GPU
|
||||
cuda_set_device(gpus[i]);
|
||||
|
@ -26,7 +25,7 @@ void train_segmenter(char *datacfg, char *cfgfile, char *weightfile, int *gpus,
|
|||
network *net = nets[0];
|
||||
image pred = get_network_image(net);
|
||||
|
||||
int div = net->w/pred.w;
|
||||
int div = net->w / pred.w;
|
||||
assert(pred.w * div == net->w);
|
||||
assert(pred.h * div == net->h);
|
||||
|
||||
|
@ -39,7 +38,7 @@ void train_segmenter(char *datacfg, char *cfgfile, char *weightfile, int *gpus,
|
|||
char *train_list = option_find_str(options, "train", "data/train.list");
|
||||
|
||||
list *plist = get_paths(train_list);
|
||||
char **paths = (char **)list_to_array(plist);
|
||||
char **paths = (char **) list_to_array(plist);
|
||||
printf("%d\n", plist->size);
|
||||
int N = plist->size;
|
||||
|
||||
|
@ -70,15 +69,15 @@ void train_segmenter(char *datacfg, char *cfgfile, char *weightfile, int *gpus,
|
|||
args.d = &buffer;
|
||||
load_thread = load_data(args);
|
||||
|
||||
int epoch = (*net->seen)/N;
|
||||
while(get_current_batch(net) < net->max_batches || net->max_batches == 0){
|
||||
int epoch = (*net->seen) / N;
|
||||
while (get_current_batch(net) < net->max_batches || net->max_batches == 0) {
|
||||
double time = what_time_is_it_now();
|
||||
|
||||
pthread_join(load_thread, 0);
|
||||
train = buffer;
|
||||
load_thread = load_data(args);
|
||||
|
||||
printf("Loaded: %lf seconds\n", what_time_is_it_now()-time);
|
||||
printf("Loaded: %lf seconds\n", what_time_is_it_now() - time);
|
||||
time = what_time_is_it_now();
|
||||
|
||||
float loss = 0;
|
||||
|
@ -91,9 +90,10 @@ void train_segmenter(char *datacfg, char *cfgfile, char *weightfile, int *gpus,
|
|||
#else
|
||||
loss = train_network(net, train);
|
||||
#endif
|
||||
if(display){
|
||||
image tr = float_to_image(net->w/div, net->h/div, 80, train.y.vals[net->batch*(net->subdivisions-1)]);
|
||||
image im = float_to_image(net->w, net->h, net->c, train.X.vals[net->batch*(net->subdivisions-1)]);
|
||||
if (display) {
|
||||
image tr = float_to_image(net->w / div, net->h / div, 80,
|
||||
train.y.vals[net->batch * (net->subdivisions - 1)]);
|
||||
image im = float_to_image(net->w, net->h, net->c, train.X.vals[net->batch * (net->subdivisions - 1)]);
|
||||
image mask = mask_to_rgb(tr);
|
||||
image prmask = mask_to_rgb(pred);
|
||||
show_image(im, "input", 1);
|
||||
|
@ -102,19 +102,21 @@ void train_segmenter(char *datacfg, char *cfgfile, char *weightfile, int *gpus,
|
|||
free_image(mask);
|
||||
free_image(prmask);
|
||||
}
|
||||
if(avg_loss == -1) avg_loss = loss;
|
||||
avg_loss = avg_loss*.9 + loss*.1;
|
||||
printf("%ld, %.3f: %f, %f avg, %f rate, %lf seconds, %ld images\n", get_current_batch(net), (float)(*net->seen)/N, loss, avg_loss, get_current_rate(net), what_time_is_it_now()-time, *net->seen);
|
||||
if (avg_loss == -1) avg_loss = loss;
|
||||
avg_loss = avg_loss * .9 + loss * .1;
|
||||
printf("%ld, %.3f: %f, %f avg, %f rate, %lf seconds, %ld images\n", get_current_batch(net),
|
||||
(float) (*net->seen) / N, loss, avg_loss, get_current_rate(net), what_time_is_it_now() - time,
|
||||
*net->seen);
|
||||
free_data(train);
|
||||
if(*net->seen/N > epoch){
|
||||
epoch = *net->seen/N;
|
||||
if (*net->seen / N > epoch) {
|
||||
epoch = *net->seen / N;
|
||||
char buff[256];
|
||||
sprintf(buff, "%s/%s_%d.weights",backup_directory,base, epoch);
|
||||
sprintf(buff, "%s/%s_%d.weights", backup_directory, base, epoch);
|
||||
save_weights(net, buff);
|
||||
}
|
||||
if(get_current_batch(net)%100 == 0){
|
||||
if (get_current_batch(net) % 100 == 0) {
|
||||
char buff[256];
|
||||
sprintf(buff, "%s/%s.backup",backup_directory,base);
|
||||
sprintf(buff, "%s/%s.backup", backup_directory, base);
|
||||
save_weights(net, buff);
|
||||
}
|
||||
}
|
||||
|
@ -123,13 +125,12 @@ void train_segmenter(char *datacfg, char *cfgfile, char *weightfile, int *gpus,
|
|||
save_weights(net, buff);
|
||||
|
||||
free_network(net);
|
||||
free_ptrs((void**)paths, plist->size);
|
||||
free_ptrs((void **) paths, plist->size);
|
||||
free_list(plist);
|
||||
free(base);
|
||||
}
|
||||
|
||||
void predict_segmenter(char *datafile, char *cfg, char *weights, char *filename)
|
||||
{
|
||||
void predict_segmenter(char *datafile, char *cfg, char *weights, char *filename) {
|
||||
network *net = load_network(cfg, weights, 0);
|
||||
set_batch_network(net, 1);
|
||||
srand(2222222);
|
||||
|
@ -137,26 +138,26 @@ void predict_segmenter(char *datafile, char *cfg, char *weights, char *filename)
|
|||
clock_t time;
|
||||
char buff[256];
|
||||
char *input = buff;
|
||||
while(1){
|
||||
if(filename){
|
||||
while (1) {
|
||||
if (filename) {
|
||||
strncpy(input, filename, 256);
|
||||
}else{
|
||||
} else {
|
||||
printf("Enter Image Path: ");
|
||||
fflush(stdout);
|
||||
input = fgets(input, 256, stdin);
|
||||
if(!input) return;
|
||||
if (!input) return;
|
||||
strtok(input, "\n");
|
||||
}
|
||||
image im = load_image_color(input, 0, 0);
|
||||
image sized = letterbox_image(im, net->w, net->h);
|
||||
|
||||
float *X = sized.data;
|
||||
time=clock();
|
||||
time = clock();
|
||||
float *predictions = network_predict(net, X);
|
||||
image pred = get_network_image(net);
|
||||
image prmask = mask_to_rgb(pred);
|
||||
printf("Predicted: %f\n", predictions[0]);
|
||||
printf("%s: Predicted in %f seconds.\n", input, sec(clock()-time));
|
||||
printf("%s: Predicted in %f seconds.\n", input, sec(clock() - time));
|
||||
show_image(sized, "orig", 1);
|
||||
show_image(prmask, "pred", 0);
|
||||
free_image(im);
|
||||
|
@ -167,8 +168,7 @@ void predict_segmenter(char *datafile, char *cfg, char *weights, char *filename)
|
|||
}
|
||||
|
||||
|
||||
void demo_segmenter(char *datacfg, char *cfg, char *weights, int cam_index, const char *filename)
|
||||
{
|
||||
void demo_segmenter(char *datacfg, char *cfg, char *weights, int cam_index, const char *filename) {
|
||||
#ifdef OPENCV
|
||||
printf("Classifier Demo\n");
|
||||
network *net = load_network(cfg, weights, 0);
|
||||
|
@ -210,9 +210,8 @@ void demo_segmenter(char *datacfg, char *cfg, char *weights, int cam_index, cons
|
|||
}
|
||||
|
||||
|
||||
void run_segmenter(int argc, char **argv)
|
||||
{
|
||||
if(argc < 4){
|
||||
void run_segmenter(int argc, char **argv) {
|
||||
if (argc < 4) {
|
||||
fprintf(stderr, "usage: %s %s [train/test/valid] [cfg] [weights (optional)]\n", argv[0], argv[1]);
|
||||
return;
|
||||
}
|
||||
|
@ -221,18 +220,18 @@ void run_segmenter(int argc, char **argv)
|
|||
int *gpus = 0;
|
||||
int gpu = 0;
|
||||
int ngpus = 0;
|
||||
if(gpu_list){
|
||||
if (gpu_list) {
|
||||
printf("%s\n", gpu_list);
|
||||
int len = strlen(gpu_list);
|
||||
ngpus = 1;
|
||||
int i;
|
||||
for(i = 0; i < len; ++i){
|
||||
for (i = 0; i < len; ++i) {
|
||||
if (gpu_list[i] == ',') ++ngpus;
|
||||
}
|
||||
gpus = calloc(ngpus, sizeof(int));
|
||||
for(i = 0; i < ngpus; ++i){
|
||||
gpus = (int *) calloc(ngpus, sizeof(int));
|
||||
for (i = 0; i < ngpus; ++i) {
|
||||
gpus[i] = atoi(gpu_list);
|
||||
gpu_list = strchr(gpu_list, ',')+1;
|
||||
gpu_list = strchr(gpu_list, ',') + 1;
|
||||
}
|
||||
} else {
|
||||
gpu = gpu_index;
|
||||
|
@ -246,10 +245,10 @@ void run_segmenter(int argc, char **argv)
|
|||
char *data = argv[3];
|
||||
char *cfg = argv[4];
|
||||
char *weights = (argc > 5) ? argv[5] : 0;
|
||||
char *filename = (argc > 6) ? argv[6]: 0;
|
||||
if(0==strcmp(argv[2], "test")) predict_segmenter(data, cfg, weights, filename);
|
||||
else if(0==strcmp(argv[2], "train")) train_segmenter(data, cfg, weights, gpus, ngpus, clear, display);
|
||||
else if(0==strcmp(argv[2], "demo")) demo_segmenter(data, cfg, weights, cam_index, filename);
|
||||
char *filename = (argc > 6) ? argv[6] : 0;
|
||||
if (0 == strcmp(argv[2], "test")) predict_segmenter(data, cfg, weights, filename);
|
||||
else if (0 == strcmp(argv[2], "train")) train_segmenter(data, cfg, weights, gpus, ngpus, clear, display);
|
||||
else if (0 == strcmp(argv[2], "demo")) demo_segmenter(data, cfg, weights, cam_index, filename);
|
||||
}
|
||||
|
||||
|
|
@ -1,9 +1,10 @@
|
|||
#include "darknet.h"
|
||||
|
||||
char *voc_names[] = {"aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat", "chair", "cow", "diningtable", "dog", "horse", "motorbike", "person", "pottedplant", "sheep", "sofa", "train", "tvmonitor"};
|
||||
char *voc_names[] = {"aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat", "chair", "cow",
|
||||
"diningtable", "dog", "horse", "motorbike", "person", "pottedplant", "sheep", "sofa", "train",
|
||||
"tvmonitor"};
|
||||
|
||||
void train_yolo(char *cfgfile, char *weightfile)
|
||||
{
|
||||
void train_yolo(char *cfgfile, char *weightfile) {
|
||||
char *train_images = "/data/voc/train.txt";
|
||||
char *backup_directory = "/home/pjreddie/backup/";
|
||||
srand(time(0));
|
||||
|
@ -12,8 +13,8 @@ void train_yolo(char *cfgfile, char *weightfile)
|
|||
float avg_loss = -1;
|
||||
network *net = load_network(cfgfile, weightfile, 0);
|
||||
printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net->learning_rate, net->momentum, net->decay);
|
||||
int imgs = net->batch*net->subdivisions;
|
||||
int i = *net->seen/imgs;
|
||||
int imgs = net->batch * net->subdivisions;
|
||||
int i = *net->seen / imgs;
|
||||
data train, buffer;
|
||||
|
||||
|
||||
|
@ -25,7 +26,7 @@ void train_yolo(char *cfgfile, char *weightfile)
|
|||
|
||||
list *plist = get_paths(train_images);
|
||||
//int N = plist->size;
|
||||
char **paths = (char **)list_to_array(plist);
|
||||
char **paths = (char **) list_to_array(plist);
|
||||
|
||||
load_args args = {0};
|
||||
args.w = net->w;
|
||||
|
@ -47,22 +48,23 @@ void train_yolo(char *cfgfile, char *weightfile)
|
|||
pthread_t load_thread = load_data_in_thread(args);
|
||||
clock_t time;
|
||||
//while(i*imgs < N*120){
|
||||
while(get_current_batch(net) < net->max_batches){
|
||||
while (get_current_batch(net) < net->max_batches) {
|
||||
i += 1;
|
||||
time=clock();
|
||||
time = clock();
|
||||
pthread_join(load_thread, 0);
|
||||
train = buffer;
|
||||
load_thread = load_data_in_thread(args);
|
||||
|
||||
printf("Loaded: %lf seconds\n", sec(clock()-time));
|
||||
printf("Loaded: %lf seconds\n", sec(clock() - time));
|
||||
|
||||
time=clock();
|
||||
time = clock();
|
||||
float loss = train_network(net, train);
|
||||
if (avg_loss < 0) avg_loss = loss;
|
||||
avg_loss = avg_loss*.9 + loss*.1;
|
||||
avg_loss = avg_loss * .9 + loss * .1;
|
||||
|
||||
printf("%d: %f, %f avg, %f rate, %lf seconds, %d images\n", i, loss, avg_loss, get_current_rate(net), sec(clock()-time), i*imgs);
|
||||
if(i%1000==0 || (i < 1000 && i%100 == 0)){
|
||||
printf("%d: %f, %f avg, %f rate, %lf seconds, %d images\n", i, loss, avg_loss, get_current_rate(net),
|
||||
sec(clock() - time), i * imgs);
|
||||
if (i % 1000 == 0 || (i < 1000 && i % 100 == 0)) {
|
||||
char buff[256];
|
||||
sprintf(buff, "%s/%s_%d.weights", backup_directory, base, i);
|
||||
save_weights(net, buff);
|
||||
|
@ -74,29 +76,28 @@ void train_yolo(char *cfgfile, char *weightfile)
|
|||
save_weights(net, buff);
|
||||
}
|
||||
|
||||
void print_yolo_detections(FILE **fps, char *id, int total, int classes, int w, int h, detection *dets)
|
||||
{
|
||||
void print_yolo_detections(FILE **fps, char *id, int total, int classes, int w, int h, detection *dets) {
|
||||
int i, j;
|
||||
for(i = 0; i < total; ++i){
|
||||
float xmin = dets[i].bbox.x - dets[i].bbox.w/2.;
|
||||
float xmax = dets[i].bbox.x + dets[i].bbox.w/2.;
|
||||
float ymin = dets[i].bbox.y - dets[i].bbox.h/2.;
|
||||
float ymax = dets[i].bbox.y + dets[i].bbox.h/2.;
|
||||
for (i = 0; i < total; ++i) {
|
||||
float xmin = dets[i].bbox.x - dets[i].bbox.w / 2.;
|
||||
float xmax = dets[i].bbox.x + dets[i].bbox.w / 2.;
|
||||
float ymin = dets[i].bbox.y - dets[i].bbox.h / 2.;
|
||||
float ymax = dets[i].bbox.y + dets[i].bbox.h / 2.;
|
||||
|
||||
if (xmin < 0) xmin = 0;
|
||||
if (ymin < 0) ymin = 0;
|
||||
if (xmax > w) xmax = w;
|
||||
if (ymax > h) ymax = h;
|
||||
|
||||
for(j = 0; j < classes; ++j){
|
||||
if (dets[i].prob[j]) fprintf(fps[j], "%s %f %f %f %f %f\n", id, dets[i].prob[j],
|
||||
xmin, ymin, xmax, ymax);
|
||||
for (j = 0; j < classes; ++j) {
|
||||
if (dets[i].prob[j])
|
||||
fprintf(fps[j], "%s %f %f %f %f %f\n", id, dets[i].prob[j],
|
||||
xmin, ymin, xmax, ymax);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void validate_yolo(char *cfg, char *weights)
|
||||
{
|
||||
void validate_yolo(char *cfg, char *weights) {
|
||||
network *net = load_network(cfg, weights, 0);
|
||||
set_batch_network(net, 1);
|
||||
fprintf(stderr, "Learning Rate: %g, Momentum: %g, Decay: %g\n", net->learning_rate, net->momentum, net->decay);
|
||||
|
@ -106,21 +107,21 @@ void validate_yolo(char *cfg, char *weights)
|
|||
//list *plist = get_paths("data/voc.2007.test");
|
||||
list *plist = get_paths("/home/pjreddie/data/voc/2007_test.txt");
|
||||
//list *plist = get_paths("data/voc.2012.test");
|
||||
char **paths = (char **)list_to_array(plist);
|
||||
char **paths = (char **) list_to_array(plist);
|
||||
|
||||
layer l = net->layers[net->n-1];
|
||||
layer l = net->layers[net->n - 1];
|
||||
int classes = l.classes;
|
||||
|
||||
int j;
|
||||
FILE **fps = calloc(classes, sizeof(FILE *));
|
||||
for(j = 0; j < classes; ++j){
|
||||
FILE **fps = (FILE **) calloc(classes, sizeof(FILE * ));
|
||||
for (j = 0; j < classes; ++j) {
|
||||
char buff[1024];
|
||||
snprintf(buff, 1024, "%s%s.txt", base, voc_names[j]);
|
||||
fps[j] = fopen(buff, "w");
|
||||
}
|
||||
|
||||
int m = plist->size;
|
||||
int i=0;
|
||||
int i = 0;
|
||||
int t;
|
||||
|
||||
float thresh = .001;
|
||||
|
@ -128,39 +129,39 @@ void validate_yolo(char *cfg, char *weights)
|
|||
float iou_thresh = .5;
|
||||
|
||||
int nthreads = 8;
|
||||
image *val = calloc(nthreads, sizeof(image));
|
||||
image *val_resized = calloc(nthreads, sizeof(image));
|
||||
image *buf = calloc(nthreads, sizeof(image));
|
||||
image *buf_resized = calloc(nthreads, sizeof(image));
|
||||
pthread_t *thr = calloc(nthreads, sizeof(pthread_t));
|
||||
image *val = (image *) calloc(nthreads, sizeof(image));
|
||||
image *val_resized = (image *) calloc(nthreads, sizeof(image));
|
||||
image *buf = (image *) calloc(nthreads, sizeof(image));
|
||||
image *buf_resized = (image *) calloc(nthreads, sizeof(image));
|
||||
pthread_t *thr = (pthread_t *) calloc(nthreads, sizeof(pthread_t));
|
||||
|
||||
load_args args = {0};
|
||||
args.w = net->w;
|
||||
args.h = net->h;
|
||||
args.type = IMAGE_DATA;
|
||||
|
||||
for(t = 0; t < nthreads; ++t){
|
||||
args.path = paths[i+t];
|
||||
for (t = 0; t < nthreads; ++t) {
|
||||
args.path = paths[i + t];
|
||||
args.im = &buf[t];
|
||||
args.resized = &buf_resized[t];
|
||||
thr[t] = load_data_in_thread(args);
|
||||
}
|
||||
time_t start = time(0);
|
||||
for(i = nthreads; i < m+nthreads; i += nthreads){
|
||||
for (i = nthreads; i < m + nthreads; i += nthreads) {
|
||||
fprintf(stderr, "%d\n", i);
|
||||
for(t = 0; t < nthreads && i+t-nthreads < m; ++t){
|
||||
for (t = 0; t < nthreads && i + t - nthreads < m; ++t) {
|
||||
pthread_join(thr[t], 0);
|
||||
val[t] = buf[t];
|
||||
val_resized[t] = buf_resized[t];
|
||||
}
|
||||
for(t = 0; t < nthreads && i+t < m; ++t){
|
||||
args.path = paths[i+t];
|
||||
for (t = 0; t < nthreads && i + t < m; ++t) {
|
||||
args.path = paths[i + t];
|
||||
args.im = &buf[t];
|
||||
args.resized = &buf_resized[t];
|
||||
thr[t] = load_data_in_thread(args);
|
||||
}
|
||||
for(t = 0; t < nthreads && i+t-nthreads < m; ++t){
|
||||
char *path = paths[i+t-nthreads];
|
||||
for (t = 0; t < nthreads && i + t - nthreads < m; ++t) {
|
||||
char *path = paths[i + t - nthreads];
|
||||
char *id = basecfg(path);
|
||||
float *X = val_resized[t].data;
|
||||
network_predict(net, X);
|
||||
|
@ -168,19 +169,18 @@ void validate_yolo(char *cfg, char *weights)
|
|||
int h = val[t].h;
|
||||
int nboxes = 0;
|
||||
detection *dets = get_network_boxes(net, w, h, thresh, 0, 0, 0, &nboxes);
|
||||
if (nms) do_nms_sort(dets, l.side*l.side*l.n, classes, iou_thresh);
|
||||
print_yolo_detections(fps, id, l.side*l.side*l.n, classes, w, h, dets);
|
||||
if (nms) do_nms_sort(dets, l.side * l.side * l.n, classes, iou_thresh);
|
||||
print_yolo_detections(fps, id, l.side * l.side * l.n, classes, w, h, dets);
|
||||
free_detections(dets, nboxes);
|
||||
free(id);
|
||||
free_image(val[t]);
|
||||
free_image(val_resized[t]);
|
||||
}
|
||||
}
|
||||
fprintf(stderr, "Total Detection Time: %f Seconds\n", (double)(time(0) - start));
|
||||
fprintf(stderr, "Total Detection Time: %f Seconds\n", (double) (time(0) - start));
|
||||
}
|
||||
|
||||
void validate_yolo_recall(char *cfg, char *weights)
|
||||
{
|
||||
void validate_yolo_recall(char *cfg, char *weights) {
|
||||
network *net = load_network(cfg, weights, 0);
|
||||
set_batch_network(net, 1);
|
||||
fprintf(stderr, "Learning Rate: %g, Momentum: %g, Decay: %g\n", net->learning_rate, net->momentum, net->decay);
|
||||
|
@ -188,22 +188,22 @@ void validate_yolo_recall(char *cfg, char *weights)
|
|||
|
||||
char *base = "results/comp4_det_test_";
|
||||
list *plist = get_paths("data/voc.2007.test");
|
||||
char **paths = (char **)list_to_array(plist);
|
||||
char **paths = (char **) list_to_array(plist);
|
||||
|
||||
layer l = net->layers[net->n-1];
|
||||
layer l = net->layers[net->n - 1];
|
||||
int classes = l.classes;
|
||||
int side = l.side;
|
||||
|
||||
int j, k;
|
||||
FILE **fps = calloc(classes, sizeof(FILE *));
|
||||
for(j = 0; j < classes; ++j){
|
||||
FILE **fps = (FILE **) calloc(classes, sizeof(FILE * ));
|
||||
for (j = 0; j < classes; ++j) {
|
||||
char buff[1024];
|
||||
snprintf(buff, 1024, "%s%s.txt", base, voc_names[j]);
|
||||
fps[j] = fopen(buff, "w");
|
||||
}
|
||||
|
||||
int m = plist->size;
|
||||
int i=0;
|
||||
int i = 0;
|
||||
|
||||
float thresh = .001;
|
||||
float iou_thresh = .5;
|
||||
|
@ -214,7 +214,7 @@ void validate_yolo_recall(char *cfg, char *weights)
|
|||
int proposals = 0;
|
||||
float avg_iou = 0;
|
||||
|
||||
for(i = 0; i < m; ++i){
|
||||
for (i = 0; i < m; ++i) {
|
||||
char *path = paths[i];
|
||||
image orig = load_image_color(path, 0, 0);
|
||||
image sized = resize_image(orig, net->w, net->h);
|
||||
|
@ -223,7 +223,7 @@ void validate_yolo_recall(char *cfg, char *weights)
|
|||
|
||||
int nboxes = 0;
|
||||
detection *dets = get_network_boxes(net, orig.w, orig.h, thresh, 0, 0, 1, &nboxes);
|
||||
if (nms) do_nms_obj(dets, side*side*l.n, 1, nms);
|
||||
if (nms) do_nms_obj(dets, side * side * l.n, 1, nms);
|
||||
|
||||
char labelpath[4096];
|
||||
find_replace(path, "images", "labels", labelpath);
|
||||
|
@ -233,8 +233,8 @@ void validate_yolo_recall(char *cfg, char *weights)
|
|||
|
||||
int num_labels = 0;
|
||||
box_label *truth = read_boxes(labelpath, &num_labels);
|
||||
for(k = 0; k < side*side*l.n; ++k){
|
||||
if(dets[k].objectness > thresh){
|
||||
for (k = 0; k < side * side * l.n; ++k) {
|
||||
if (dets[k].objectness > thresh) {
|
||||
++proposals;
|
||||
}
|
||||
}
|
||||
|
@ -242,19 +242,20 @@ void validate_yolo_recall(char *cfg, char *weights)
|
|||
++total;
|
||||
box t = {truth[j].x, truth[j].y, truth[j].w, truth[j].h};
|
||||
float best_iou = 0;
|
||||
for(k = 0; k < side*side*l.n; ++k){
|
||||
for (k = 0; k < side * side * l.n; ++k) {
|
||||
float iou = box_iou(dets[k].bbox, t);
|
||||
if(dets[k].objectness > thresh && iou > best_iou){
|
||||
if (dets[k].objectness > thresh && iou > best_iou) {
|
||||
best_iou = iou;
|
||||
}
|
||||
}
|
||||
avg_iou += best_iou;
|
||||
if(best_iou > iou_thresh){
|
||||
if (best_iou > iou_thresh) {
|
||||
++correct;
|
||||
}
|
||||
}
|
||||
|
||||
fprintf(stderr, "%5d %5d %5d\tRPs/Img: %.2f\tIOU: %.2f%%\tRecall:%.2f%%\n", i, correct, total, (float)proposals/(i+1), avg_iou*100/total, 100.*correct/total);
|
||||
fprintf(stderr, "%5d %5d %5d\tRPs/Img: %.2f\tIOU: %.2f%%\tRecall:%.2f%%\n", i, correct, total,
|
||||
(float) proposals / (i + 1), avg_iou * 100 / total, 100. * correct / total);
|
||||
free_detections(dets, nboxes);
|
||||
free(id);
|
||||
free_image(orig);
|
||||
|
@ -262,39 +263,38 @@ void validate_yolo_recall(char *cfg, char *weights)
|
|||
}
|
||||
}
|
||||
|
||||
void test_yolo(char *cfgfile, char *weightfile, char *filename, float thresh)
|
||||
{
|
||||
void test_yolo(char *cfgfile, char *weightfile, char *filename, float thresh) {
|
||||
image **alphabet = load_alphabet();
|
||||
network *net = load_network(cfgfile, weightfile, 0);
|
||||
layer l = net->layers[net->n-1];
|
||||
layer l = net->layers[net->n - 1];
|
||||
set_batch_network(net, 1);
|
||||
srand(2222222);
|
||||
clock_t time;
|
||||
char buff[256];
|
||||
char *input = buff;
|
||||
float nms=.4;
|
||||
while(1){
|
||||
if(filename){
|
||||
float nms = .4;
|
||||
while (1) {
|
||||
if (filename) {
|
||||
strncpy(input, filename, 256);
|
||||
} else {
|
||||
printf("Enter Image Path: ");
|
||||
fflush(stdout);
|
||||
input = fgets(input, 256, stdin);
|
||||
if(!input) return;
|
||||
if (!input) return;
|
||||
strtok(input, "\n");
|
||||
}
|
||||
image im = load_image_color(input,0,0);
|
||||
image im = load_image_color(input, 0, 0);
|
||||
image sized = resize_image(im, net->w, net->h);
|
||||
float *X = sized.data;
|
||||
time=clock();
|
||||
time = clock();
|
||||
network_predict(net, X);
|
||||
printf("%s: Predicted in %f seconds.\n", input, sec(clock()-time));
|
||||
printf("%s: Predicted in %f seconds.\n", input, sec(clock() - time));
|
||||
|
||||
int nboxes = 0;
|
||||
detection *dets = get_network_boxes(net, 1, 1, thresh, 0, 0, 0, &nboxes);
|
||||
if (nms) do_nms_sort(dets, l.side*l.side*l.n, l.classes, nms);
|
||||
if (nms) do_nms_sort(dets, l.side * l.side * l.n, l.classes, nms);
|
||||
|
||||
draw_detections(im, dets, l.side*l.side*l.n, thresh, voc_names, alphabet, 20);
|
||||
draw_detections(im, dets, l.side * l.side * l.n, thresh, voc_names, alphabet, 20);
|
||||
save_image(im, "predictions");
|
||||
show_image(im, "predictions", 0);
|
||||
free_detections(dets, nboxes);
|
||||
|
@ -304,13 +304,12 @@ void test_yolo(char *cfgfile, char *weightfile, char *filename, float thresh)
|
|||
}
|
||||
}
|
||||
|
||||
void run_yolo(int argc, char **argv)
|
||||
{
|
||||
void run_yolo(int argc, char **argv) {
|
||||
char *prefix = find_char_arg(argc, argv, "-prefix", 0);
|
||||
float thresh = find_float_arg(argc, argv, "-thresh", .2);
|
||||
int cam_index = find_int_arg(argc, argv, "-c", 0);
|
||||
int frame_skip = find_int_arg(argc, argv, "-s", 0);
|
||||
if(argc < 4){
|
||||
if (argc < 4) {
|
||||
fprintf(stderr, "usage: %s %s [train/test/valid] [cfg] [weights (optional)]\n", argv[0], argv[1]);
|
||||
return;
|
||||
}
|
||||
|
@ -318,10 +317,11 @@ void run_yolo(int argc, char **argv)
|
|||
int avg = find_int_arg(argc, argv, "-avg", 1);
|
||||
char *cfg = argv[3];
|
||||
char *weights = (argc > 4) ? argv[4] : 0;
|
||||
char *filename = (argc > 5) ? argv[5]: 0;
|
||||
if(0==strcmp(argv[2], "test")) test_yolo(cfg, weights, filename, thresh);
|
||||
else if(0==strcmp(argv[2], "train")) train_yolo(cfg, weights);
|
||||
else if(0==strcmp(argv[2], "valid")) validate_yolo(cfg, weights);
|
||||
else if(0==strcmp(argv[2], "recall")) validate_yolo_recall(cfg, weights);
|
||||
else if(0==strcmp(argv[2], "demo")) demo(cfg, weights, thresh, cam_index, filename, voc_names, 20, frame_skip, prefix, avg, .5, 0,0,0,0);
|
||||
char *filename = (argc > 5) ? argv[5] : 0;
|
||||
if (0 == strcmp(argv[2], "test")) test_yolo(cfg, weights, filename, thresh);
|
||||
else if (0 == strcmp(argv[2], "train")) train_yolo(cfg, weights);
|
||||
else if (0 == strcmp(argv[2], "valid")) validate_yolo(cfg, weights);
|
||||
else if (0 == strcmp(argv[2], "recall")) validate_yolo_recall(cfg, weights);
|
||||
else if (0 == strcmp(argv[2], "demo"))
|
||||
demo(cfg, weights, thresh, cam_index, filename, voc_names, 20, frame_skip, prefix, avg, .5, 0, 0, 0, 0);
|
||||
}
|
|
@ -1,37 +1,36 @@
|
|||
#ifndef DARKNET_API
|
||||
#define DARKNET_API
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <pthread.h>
|
||||
|
||||
#ifdef GPU
|
||||
#define BLOCK 512
|
||||
#define BLOCK 512
|
||||
|
||||
#include "cuda_runtime.h"
|
||||
#include "curand.h"
|
||||
#include "cublas_v2.h"
|
||||
#include "hip/hip_runtime.h"
|
||||
#include "hiprand.h"
|
||||
#include "hipblas.h"
|
||||
#include "cuda.h"
|
||||
|
||||
#ifdef CUDNN
|
||||
#include "cudnn.h"
|
||||
#endif
|
||||
#ifdef CUDNN
|
||||
#include "cudnn.h"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#define SECRET_NUM -1234
|
||||
extern int gpu_index;
|
||||
|
||||
typedef struct{
|
||||
typedef struct {
|
||||
int classes;
|
||||
char **names;
|
||||
} metadata;
|
||||
|
||||
metadata get_metadata(char *file);
|
||||
|
||||
typedef struct{
|
||||
typedef struct {
|
||||
int *leaf;
|
||||
int n;
|
||||
int *parent;
|
||||
|
@ -43,17 +42,18 @@ typedef struct{
|
|||
int *group_size;
|
||||
int *group_offset;
|
||||
} tree;
|
||||
|
||||
tree *read_tree(char *filename);
|
||||
|
||||
typedef enum{
|
||||
typedef enum {
|
||||
LOGISTIC, RELU, RELIE, LINEAR, RAMP, TANH, PLSE, LEAKY, ELU, LOGGY, STAIR, HARDTAN, LHTAN, SELU
|
||||
} ACTIVATION;
|
||||
|
||||
typedef enum{
|
||||
typedef enum {
|
||||
PNG, BMP, TGA, JPG
|
||||
} IMTYPE;
|
||||
|
||||
typedef enum{
|
||||
typedef enum {
|
||||
MULT, ADD, SUB, DIV
|
||||
} BINARY_ACTIVATION;
|
||||
|
||||
|
@ -90,11 +90,11 @@ typedef enum {
|
|||
BLANK
|
||||
} LAYER_TYPE;
|
||||
|
||||
typedef enum{
|
||||
SSE, MASKED, L1, SEG, SMOOTH,WGAN
|
||||
typedef enum {
|
||||
SSE, MASKED, L1, SEG, SMOOTH, WGAN
|
||||
} COST_TYPE;
|
||||
|
||||
typedef struct{
|
||||
typedef struct {
|
||||
int batch;
|
||||
float learning_rate;
|
||||
float momentum;
|
||||
|
@ -112,16 +112,23 @@ typedef struct network network;
|
|||
struct layer;
|
||||
typedef struct layer layer;
|
||||
|
||||
struct layer{
|
||||
struct layer {
|
||||
LAYER_TYPE type;
|
||||
ACTIVATION activation;
|
||||
COST_TYPE cost_type;
|
||||
void (*forward) (struct layer, struct network);
|
||||
void (*backward) (struct layer, struct network);
|
||||
void (*update) (struct layer, update_args);
|
||||
void (*forward_gpu) (struct layer, struct network);
|
||||
void (*backward_gpu) (struct layer, struct network);
|
||||
void (*update_gpu) (struct layer, update_args);
|
||||
|
||||
void (*forward)(struct layer, struct network);
|
||||
|
||||
void (*backward)(struct layer, struct network);
|
||||
|
||||
void (*update)(struct layer, update_args);
|
||||
|
||||
void (*forward_gpu)(struct layer, struct network);
|
||||
|
||||
void (*backward_gpu)(struct layer, struct network);
|
||||
|
||||
void (*update_gpu)(struct layer, update_args);
|
||||
|
||||
int batch_normalize;
|
||||
int shortcut;
|
||||
int batch;
|
||||
|
@ -133,7 +140,7 @@ struct layer{
|
|||
int nbiases;
|
||||
int extra;
|
||||
int truths;
|
||||
int h,w,c;
|
||||
int h, w, c;
|
||||
int out_h, out_w, out_c;
|
||||
int n;
|
||||
int max_boxes;
|
||||
|
@ -207,69 +214,69 @@ struct layer{
|
|||
float probability;
|
||||
float scale;
|
||||
|
||||
char * cweights;
|
||||
int * indexes;
|
||||
int * input_layers;
|
||||
int * input_sizes;
|
||||
int * map;
|
||||
int * counts;
|
||||
float ** sums;
|
||||
float * rand;
|
||||
float * cost;
|
||||
float * state;
|
||||
float * prev_state;
|
||||
float * forgot_state;
|
||||
float * forgot_delta;
|
||||
float * state_delta;
|
||||
float * combine_cpu;
|
||||
float * combine_delta_cpu;
|
||||
char *cweights;
|
||||
int *indexes;
|
||||
int *input_layers;
|
||||
int *input_sizes;
|
||||
int *map;
|
||||
int *counts;
|
||||
float **sums;
|
||||
float *rand;
|
||||
float *cost;
|
||||
float *state;
|
||||
float *prev_state;
|
||||
float *forgot_state;
|
||||
float *forgot_delta;
|
||||
float *state_delta;
|
||||
float *combine_cpu;
|
||||
float *combine_delta_cpu;
|
||||
|
||||
float * concat;
|
||||
float * concat_delta;
|
||||
float *concat;
|
||||
float *concat_delta;
|
||||
|
||||
float * binary_weights;
|
||||
float *binary_weights;
|
||||
|
||||
float * biases;
|
||||
float * bias_updates;
|
||||
float *biases;
|
||||
float *bias_updates;
|
||||
|
||||
float * scales;
|
||||
float * scale_updates;
|
||||
float *scales;
|
||||
float *scale_updates;
|
||||
|
||||
float * weights;
|
||||
float * weight_updates;
|
||||
float *weights;
|
||||
float *weight_updates;
|
||||
|
||||
float * delta;
|
||||
float * output;
|
||||
float * loss;
|
||||
float * squared;
|
||||
float * norms;
|
||||
float *delta;
|
||||
float *output;
|
||||
float *loss;
|
||||
float *squared;
|
||||
float *norms;
|
||||
|
||||
float * spatial_mean;
|
||||
float * mean;
|
||||
float * variance;
|
||||
float *spatial_mean;
|
||||
float *mean;
|
||||
float *variance;
|
||||
|
||||
float * mean_delta;
|
||||
float * variance_delta;
|
||||
float *mean_delta;
|
||||
float *variance_delta;
|
||||
|
||||
float * rolling_mean;
|
||||
float * rolling_variance;
|
||||
float *rolling_mean;
|
||||
float *rolling_variance;
|
||||
|
||||
float * x;
|
||||
float * x_norm;
|
||||
float *x;
|
||||
float *x_norm;
|
||||
|
||||
float * m;
|
||||
float * v;
|
||||
|
||||
float * bias_m;
|
||||
float * bias_v;
|
||||
float * scale_m;
|
||||
float * scale_v;
|
||||
float *m;
|
||||
float *v;
|
||||
|
||||
float *bias_m;
|
||||
float *bias_v;
|
||||
float *scale_m;
|
||||
float *scale_v;
|
||||
|
||||
|
||||
float *z_cpu;
|
||||
float *r_cpu;
|
||||
float *h_cpu;
|
||||
float * prev_state_cpu;
|
||||
float *prev_state_cpu;
|
||||
|
||||
float *temp_cpu;
|
||||
float *temp2_cpu;
|
||||
|
@ -284,9 +291,9 @@ struct layer{
|
|||
float *g_cpu;
|
||||
float *o_cpu;
|
||||
float *c_cpu;
|
||||
float *dc_cpu;
|
||||
float *dc_cpu;
|
||||
|
||||
float * binary_input;
|
||||
float *binary_input;
|
||||
|
||||
struct layer *input_layer;
|
||||
struct layer *self_layer;
|
||||
|
@ -311,7 +318,7 @@ struct layer{
|
|||
|
||||
struct layer *input_h_layer;
|
||||
struct layer *state_h_layer;
|
||||
|
||||
|
||||
struct layer *wz;
|
||||
struct layer *uz;
|
||||
struct layer *wr;
|
||||
|
@ -427,7 +434,7 @@ typedef enum {
|
|||
CONSTANT, STEP, EXP, POLY, STEPS, SIG, RANDOM
|
||||
} learning_rate_policy;
|
||||
|
||||
typedef struct network{
|
||||
typedef struct network {
|
||||
int n;
|
||||
int batch;
|
||||
size_t *seen;
|
||||
|
@ -448,7 +455,7 @@ typedef struct network{
|
|||
int step;
|
||||
int max_batches;
|
||||
float *scales;
|
||||
int *steps;
|
||||
int *steps;
|
||||
int num_steps;
|
||||
int burn_in;
|
||||
|
||||
|
@ -512,11 +519,11 @@ typedef struct {
|
|||
float *data;
|
||||
} image;
|
||||
|
||||
typedef struct{
|
||||
typedef struct {
|
||||
float x, y, w, h;
|
||||
} box;
|
||||
|
||||
typedef struct detection{
|
||||
typedef struct detection {
|
||||
box bbox;
|
||||
int classes;
|
||||
float *prob;
|
||||
|
@ -525,13 +532,13 @@ typedef struct detection{
|
|||
int sort_class;
|
||||
} detection;
|
||||
|
||||
typedef struct matrix{
|
||||
typedef struct matrix {
|
||||
int rows, cols;
|
||||
float **vals;
|
||||
} matrix;
|
||||
|
||||
|
||||
typedef struct{
|
||||
typedef struct {
|
||||
int w, h;
|
||||
matrix X;
|
||||
matrix y;
|
||||
|
@ -541,10 +548,27 @@ typedef struct{
|
|||
} data;
|
||||
|
||||
typedef enum {
|
||||
CLASSIFICATION_DATA, DETECTION_DATA, CAPTCHA_DATA, REGION_DATA, IMAGE_DATA, COMPARE_DATA, WRITING_DATA, SWAG_DATA, TAG_DATA, OLD_CLASSIFICATION_DATA, STUDY_DATA, DET_DATA, SUPER_DATA, LETTERBOX_DATA, REGRESSION_DATA, SEGMENTATION_DATA, INSTANCE_DATA, ISEG_DATA
|
||||
CLASSIFICATION_DATA,
|
||||
DETECTION_DATA,
|
||||
CAPTCHA_DATA,
|
||||
REGION_DATA,
|
||||
IMAGE_DATA,
|
||||
COMPARE_DATA,
|
||||
WRITING_DATA,
|
||||
SWAG_DATA,
|
||||
TAG_DATA,
|
||||
OLD_CLASSIFICATION_DATA,
|
||||
STUDY_DATA,
|
||||
DET_DATA,
|
||||
SUPER_DATA,
|
||||
LETTERBOX_DATA,
|
||||
REGRESSION_DATA,
|
||||
SEGMENTATION_DATA,
|
||||
INSTANCE_DATA,
|
||||
ISEG_DATA
|
||||
} data_type;
|
||||
|
||||
typedef struct load_args{
|
||||
typedef struct load_args {
|
||||
int threads;
|
||||
char **paths;
|
||||
char *path;
|
||||
|
@ -577,52 +601,68 @@ typedef struct load_args{
|
|||
tree *hierarchy;
|
||||
} load_args;
|
||||
|
||||
typedef struct{
|
||||
typedef struct {
|
||||
int id;
|
||||
float x,y,w,h;
|
||||
float x, y, w, h;
|
||||
float left, right, top, bottom;
|
||||
} box_label;
|
||||
|
||||
|
||||
network *load_network(char *cfg, char *weights, int clear);
|
||||
|
||||
load_args get_base_args(network *net);
|
||||
|
||||
void free_data(data d);
|
||||
|
||||
typedef struct node{
|
||||
typedef struct node {
|
||||
void *val;
|
||||
struct node *next;
|
||||
struct node *prev;
|
||||
} node;
|
||||
|
||||
typedef struct list{
|
||||
typedef struct list {
|
||||
int size;
|
||||
node *front;
|
||||
node *back;
|
||||
} list;
|
||||
|
||||
pthread_t load_data(load_args args);
|
||||
|
||||
list *read_data_cfg(char *filename);
|
||||
|
||||
list *read_cfg(char *filename);
|
||||
|
||||
unsigned char *read_file(char *filename);
|
||||
|
||||
data resize_data(data orig, int w, int h);
|
||||
|
||||
data *tile_data(data orig, int divs, int size);
|
||||
|
||||
data select_data(data *orig, int *inds);
|
||||
|
||||
void forward_network(network *net);
|
||||
|
||||
void backward_network(network *net);
|
||||
|
||||
void update_network(network *net);
|
||||
|
||||
|
||||
float dot_cpu(int N, float *X, int INCX, float *Y, int INCY);
|
||||
|
||||
void axpy_cpu(int N, float ALPHA, float *X, int INCX, float *Y, int INCY);
|
||||
|
||||
void copy_cpu(int N, float *X, int INCX, float *Y, int INCY);
|
||||
|
||||
void scal_cpu(int N, float ALPHA, float *X, int INCX);
|
||||
void fill_cpu(int N, float ALPHA, float * X, int INCX);
|
||||
|
||||
void fill_cpu(int N, float ALPHA, float *X, int INCX);
|
||||
|
||||
void normalize_cpu(float *x, float *mean, float *variance, int batch, int filters, int spatial);
|
||||
|
||||
void softmax(float *input, int n, float temp, int stride, float *output);
|
||||
|
||||
int best_3d_shift_r(image a, image b, int min, int max);
|
||||
|
||||
#ifdef GPU
|
||||
void axpy_gpu(int N, float ALPHA, float * X, int INCX, float * Y, int INCY);
|
||||
void fill_gpu(int N, float ALPHA, float * X, int INCX);
|
||||
|
@ -644,112 +684,204 @@ float train_networks(network **nets, int n, data d, int interval);
|
|||
void sync_nets(network **nets, int n, int interval);
|
||||
void harmless_update_network_gpu(network *net);
|
||||
#endif
|
||||
|
||||
image get_label(image **characters, char *string, int size);
|
||||
|
||||
void draw_label(image a, int r, int c, image label, const float *rgb);
|
||||
|
||||
void save_image(image im, const char *name);
|
||||
|
||||
void save_image_options(image im, const char *name, IMTYPE f, int quality);
|
||||
|
||||
void get_next_batch(data d, int n, int offset, float *X, float *y);
|
||||
|
||||
void grayscale_image_3c(image im);
|
||||
|
||||
void normalize_image(image p);
|
||||
|
||||
void matrix_to_csv(matrix m);
|
||||
|
||||
float train_network_sgd(network *net, data d, int n);
|
||||
|
||||
void rgbgr_image(image im);
|
||||
|
||||
data copy_data(data d);
|
||||
|
||||
data concat_data(data d1, data d2);
|
||||
|
||||
data load_cifar10_data(char *filename);
|
||||
|
||||
float matrix_topk_accuracy(matrix truth, matrix guess, int k);
|
||||
|
||||
void matrix_add_matrix(matrix from, matrix to);
|
||||
|
||||
void scale_matrix(matrix m, float scale);
|
||||
|
||||
matrix csv_to_matrix(char *filename);
|
||||
|
||||
float *network_accuracies(network *net, data d, int n);
|
||||
|
||||
float train_network_datum(network *net);
|
||||
|
||||
image make_random_image(int w, int h, int c);
|
||||
|
||||
void denormalize_connected_layer(layer l);
|
||||
|
||||
void denormalize_convolutional_layer(layer l);
|
||||
|
||||
void statistics_connected_layer(layer l);
|
||||
|
||||
void rescale_weights(layer l, float scale, float trans);
|
||||
|
||||
void rgbgr_weights(layer l);
|
||||
|
||||
image *get_weights(layer l);
|
||||
|
||||
void demo(char *cfgfile, char *weightfile, float thresh, int cam_index, const char *filename, char **names, int classes, int frame_skip, char *prefix, int avg, float hier_thresh, int w, int h, int fps, int fullscreen);
|
||||
void demo(char *cfgfile, char *weightfile, float thresh, int cam_index, const char *filename, char **names, int classes,
|
||||
int frame_skip, char *prefix, int avg, float hier_thresh, int w, int h, int fps, int fullscreen);
|
||||
|
||||
void get_detection_detections(layer l, int w, int h, float thresh, detection *dets);
|
||||
|
||||
char *option_find_str(list *l, char *key, char *def);
|
||||
|
||||
int option_find_int(list *l, char *key, int def);
|
||||
|
||||
int option_find_int_quiet(list *l, char *key, int def);
|
||||
|
||||
network *parse_network_cfg(char *filename);
|
||||
|
||||
void save_weights(network *net, char *filename);
|
||||
|
||||
void load_weights(network *net, char *filename);
|
||||
|
||||
void save_weights_upto(network *net, char *filename, int cutoff);
|
||||
|
||||
void load_weights_upto(network *net, char *filename, int start, int cutoff);
|
||||
|
||||
void zero_objectness(layer l);
|
||||
void get_region_detections(layer l, int w, int h, int netw, int neth, float thresh, int *map, float tree_thresh, int relative, detection *dets);
|
||||
int get_yolo_detections(layer l, int w, int h, int netw, int neth, float thresh, int *map, int relative, detection *dets);
|
||||
|
||||
void get_region_detections(layer l, int w, int h, int netw, int neth, float thresh, int *map, float tree_thresh,
|
||||
int relative, detection *dets);
|
||||
|
||||
int
|
||||
get_yolo_detections(layer l, int w, int h, int netw, int neth, float thresh, int *map, int relative, detection *dets);
|
||||
|
||||
void free_network(network *net);
|
||||
|
||||
void set_batch_network(network *net, int b);
|
||||
|
||||
void set_temp_network(network *net, float t);
|
||||
|
||||
image load_image(char *filename, int w, int h, int c);
|
||||
|
||||
image load_image_color(char *filename, int w, int h);
|
||||
|
||||
image make_image(int w, int h, int c);
|
||||
|
||||
image resize_image(image im, int w, int h);
|
||||
|
||||
void censor_image(image im, int dx, int dy, int w, int h);
|
||||
|
||||
image letterbox_image(image im, int w, int h);
|
||||
|
||||
image crop_image(image im, int dx, int dy, int w, int h);
|
||||
|
||||
image center_crop_image(image im, int w, int h);
|
||||
|
||||
image resize_min(image im, int min);
|
||||
|
||||
image resize_max(image im, int max);
|
||||
|
||||
image threshold_image(image im, float thresh);
|
||||
|
||||
image mask_to_rgb(image mask);
|
||||
|
||||
int resize_network(network *net, int w, int h);
|
||||
|
||||
void free_matrix(matrix m);
|
||||
|
||||
void test_resize(char *filename);
|
||||
|
||||
int show_image(image p, const char *name, int ms);
|
||||
|
||||
image copy_image(image p);
|
||||
|
||||
void draw_box_width(image a, int x1, int y1, int x2, int y2, int w, float r, float g, float b);
|
||||
|
||||
float get_current_rate(network *net);
|
||||
|
||||
void composite_3d(char *f1, char *f2, char *out, int delta);
|
||||
|
||||
data load_data_old(char **paths, int n, int m, char **labels, int k, int w, int h);
|
||||
|
||||
size_t get_current_batch(network *net);
|
||||
|
||||
void constrain_image(image im);
|
||||
|
||||
image get_network_image_layer(network *net, int i);
|
||||
|
||||
layer get_network_output_layer(network *net);
|
||||
|
||||
void top_predictions(network *net, int n, int *index);
|
||||
|
||||
void flip_image(image a);
|
||||
|
||||
image float_to_image(int w, int h, int c, float *data);
|
||||
|
||||
void ghost_image(image source, image dest, int dx, int dy);
|
||||
|
||||
float network_accuracy(network *net, data d);
|
||||
|
||||
void random_distort_image(image im, float hue, float saturation, float exposure);
|
||||
|
||||
void fill_image(image m, float s);
|
||||
|
||||
image grayscale_image(image im);
|
||||
|
||||
void rotate_image_cw(image im, int times);
|
||||
|
||||
double what_time_is_it_now();
|
||||
|
||||
image rotate_image(image m, float rad);
|
||||
|
||||
void visualize_network(network *net);
|
||||
|
||||
float box_iou(box a, box b);
|
||||
|
||||
data load_all_cifar10();
|
||||
|
||||
box_label *read_boxes(char *filename, int *n);
|
||||
|
||||
box float_to_box(float *f, int stride);
|
||||
|
||||
void draw_detections(image im, detection *dets, int num, float thresh, char **names, image **alphabet, int classes);
|
||||
|
||||
matrix network_predict_data(network *net, data test);
|
||||
|
||||
image **load_alphabet();
|
||||
|
||||
image get_network_image(network *net);
|
||||
|
||||
float *network_predict(network *net, float *input);
|
||||
|
||||
int network_width(network *net);
|
||||
|
||||
int network_height(network *net);
|
||||
|
||||
float *network_predict_image(network *net, image im);
|
||||
|
||||
void network_detect(network *net, image im, float thresh, float hier_thresh, float nms, detection *dets);
|
||||
|
||||
detection *get_network_boxes(network *net, int w, int h, float thresh, float hier, int *map, int relative, int *num);
|
||||
|
||||
void free_detections(detection *dets, int n);
|
||||
|
||||
void reset_network_state(network *net, int b);
|
||||
|
||||
char **get_labels(char *filename);
|
||||
|
||||
void do_nms_obj(detection *dets, int total, int classes, float thresh);
|
||||
|
||||
void do_nms_sort(detection *dets, int total, int classes, float thresh);
|
||||
|
||||
matrix make_matrix(int rows, int cols);
|
||||
|
@ -761,45 +893,77 @@ void make_window(char *name, int w, int h, int fullscreen);
|
|||
#endif
|
||||
|
||||
void free_image(image m);
|
||||
|
||||
float train_network(network *net, data d);
|
||||
|
||||
pthread_t load_data_in_thread(load_args args);
|
||||
|
||||
void load_data_blocking(load_args args);
|
||||
|
||||
list *get_paths(char *filename);
|
||||
|
||||
void hierarchy_predictions(float *predictions, int n, tree *hier, int only_leaves, int stride);
|
||||
|
||||
void change_leaves(tree *t, char *leaf_list);
|
||||
|
||||
int find_int_arg(int argc, char **argv, char *arg, int def);
|
||||
|
||||
float find_float_arg(int argc, char **argv, char *arg, float def);
|
||||
int find_arg(int argc, char* argv[], char *arg);
|
||||
|
||||
int find_arg(int argc, char *argv[], char *arg);
|
||||
|
||||
char *find_char_arg(int argc, char **argv, char *arg, char *def);
|
||||
|
||||
char *basecfg(char *cfgfile);
|
||||
|
||||
void find_replace(char *str, char *orig, char *rep, char *output);
|
||||
|
||||
void free_ptrs(void **ptrs, int n);
|
||||
|
||||
char *fgetl(FILE *fp);
|
||||
|
||||
void strip(char *s);
|
||||
|
||||
float sec(clock_t clocks);
|
||||
|
||||
void **list_to_array(list *l);
|
||||
|
||||
void top_k(float *a, int n, int k, int *index);
|
||||
|
||||
int *read_map(char *filename);
|
||||
|
||||
void error(const char *s);
|
||||
|
||||
int max_index(float *a, int n);
|
||||
|
||||
int max_int_index(int *a, int n);
|
||||
|
||||
int sample_array(float *a, int n);
|
||||
|
||||
int *random_index_order(int min, int max);
|
||||
|
||||
void free_list(list *l);
|
||||
|
||||
float mse_array(float *a, int n);
|
||||
|
||||
float variance_array(float *a, int n);
|
||||
|
||||
float mag_array(float *a, int n);
|
||||
|
||||
void scale_array(float *a, int n, float s);
|
||||
|
||||
float mean_array(float *a, int n);
|
||||
|
||||
float sum_array(float *a, int n);
|
||||
|
||||
void normalize_array(float *a, int n);
|
||||
|
||||
int *read_intlist(char *s, int *n, int d);
|
||||
|
||||
size_t rand_size_t();
|
||||
|
||||
float rand_normal();
|
||||
|
||||
float rand_uniform(float min, float max);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
|
|
@ -0,0 +1,183 @@
|
|||
import os
|
||||
from ctypes import *
|
||||
import math
|
||||
import random
|
||||
|
||||
#
|
||||
# def sample(probs):
|
||||
# s = sum(probs)
|
||||
# probs = [a / s for a in probs]
|
||||
# r = random.uniform(0, 1)
|
||||
# for i in range(len(probs)):
|
||||
# r = r - probs[i]
|
||||
# if r <= 0:
|
||||
# return i
|
||||
# return len(probs) - 1
|
||||
#
|
||||
#
|
||||
# def c_array(ctype, values):
|
||||
# arr = (ctype * len(values))()
|
||||
# arr[:] = values
|
||||
# return arr
|
||||
#
|
||||
#
|
||||
# class BOX(Structure):
|
||||
# _fields_ = [("x", c_float),
|
||||
# ("y", c_float),
|
||||
# ("w", c_float),
|
||||
# ("h", c_float)]
|
||||
#
|
||||
#
|
||||
# class DETECTION(Structure):
|
||||
# _fields_ = [("bbox", BOX),
|
||||
# ("classes", c_int),
|
||||
# ("prob", POINTER(c_float)),
|
||||
# ("mask", POINTER(c_float)),
|
||||
# ("objectness", c_float),
|
||||
# ("sort_class", c_int)]
|
||||
#
|
||||
#
|
||||
# class IMAGE(Structure):
|
||||
# _fields_ = [("w", c_int),
|
||||
# ("h", c_int),
|
||||
# ("c", c_int),
|
||||
# ("data", POINTER(c_float))]
|
||||
#
|
||||
#
|
||||
# class METADATA(Structure):
|
||||
# _fields_ = [("classes", c_int),
|
||||
# ("names", POINTER(c_char_p))]
|
||||
#
|
||||
#
|
||||
# def main():
|
||||
# # lib = CDLL("/home/pjreddie/documents/darknet/libdarknet.so", RTLD_GLOBAL)
|
||||
# lib = CDLL("libdarknet.so", RTLD_GLOBAL)
|
||||
# lib.network_width.argtypes = [c_void_p]
|
||||
# lib.network_width.restype = c_int
|
||||
# lib.network_height.argtypes = [c_void_p]
|
||||
# lib.network_height.restype = c_int
|
||||
#
|
||||
# predict = lib.network_predict
|
||||
# predict.argtypes = [c_void_p, POINTER(c_float)]
|
||||
# predict.restype = POINTER(c_float)
|
||||
#
|
||||
# set_gpu = lib.cuda_set_device
|
||||
# set_gpu.argtypes = [c_int]
|
||||
#
|
||||
# make_image = lib.make_image
|
||||
# make_image.argtypes = [c_int, c_int, c_int]
|
||||
# make_image.restype = IMAGE
|
||||
#
|
||||
# get_network_boxes = lib.get_network_boxes
|
||||
# get_network_boxes.argtypes = [c_void_p, c_int, c_int, c_float, c_float, POINTER(c_int), c_int, POINTER(c_int)]
|
||||
# get_network_boxes.restype = POINTER(DETECTION)
|
||||
#
|
||||
# make_network_boxes = lib.make_network_boxes
|
||||
# make_network_boxes.argtypes = [c_void_p]
|
||||
# make_network_boxes.restype = POINTER(DETECTION)
|
||||
#
|
||||
# free_detections = lib.free_detections
|
||||
# free_detections.argtypes = [POINTER(DETECTION), c_int]
|
||||
#
|
||||
# free_ptrs = lib.free_ptrs
|
||||
# free_ptrs.argtypes = [POINTER(c_void_p), c_int]
|
||||
#
|
||||
# network_predict = lib.network_predict
|
||||
# network_predict.argtypes = [c_void_p, POINTER(c_float)]
|
||||
#
|
||||
# reset_rnn = lib.reset_rnn
|
||||
# reset_rnn.argtypes = [c_void_p]
|
||||
#
|
||||
# load_net = lib.load_network
|
||||
# load_net.argtypes = [c_char_p, c_char_p, c_int]
|
||||
# load_net.restype = c_void_p
|
||||
#
|
||||
# do_nms_obj = lib.do_nms_obj
|
||||
# do_nms_obj.argtypes = [POINTER(DETECTION), c_int, c_int, c_float]
|
||||
#
|
||||
# do_nms_sort = lib.do_nms_sort
|
||||
# do_nms_sort.argtypes = [POINTER(DETECTION), c_int, c_int, c_float]
|
||||
#
|
||||
# free_image = lib.free_image
|
||||
# free_image.argtypes = [IMAGE]
|
||||
#
|
||||
# letterbox_image = lib.letterbox_image
|
||||
# letterbox_image.argtypes = [IMAGE, c_int, c_int]
|
||||
# letterbox_image.restype = IMAGE
|
||||
#
|
||||
# load_meta = lib.get_metadata
|
||||
# lib.get_metadata.argtypes = [c_char_p]
|
||||
# lib.get_metadata.restype = METADATA
|
||||
#
|
||||
# load_image = lib.load_image_color
|
||||
# load_image.argtypes = [c_char_p, c_int, c_int]
|
||||
# load_image.restype = IMAGE
|
||||
#
|
||||
# rgbgr_image = lib.rgbgr_image
|
||||
# rgbgr_image.argtypes = [IMAGE]
|
||||
#
|
||||
# predict_image = lib.network_predict_image
|
||||
# predict_image.argtypes = [c_void_p, IMAGE]
|
||||
# predict_image.restype = POINTER(c_float)
|
||||
#
|
||||
#
|
||||
# def classify(net, meta, im):
|
||||
# out = predict_image(net, im)
|
||||
# res = []
|
||||
# for i in range(meta.classes):
|
||||
# res.append((meta.names[i], out[i]))
|
||||
# res = sorted(res, key=lambda x: -x[1])
|
||||
# return res
|
||||
#
|
||||
#
|
||||
# def detect(net, meta, image, thresh=.5, hier_thresh=.5, nms=.45):
|
||||
# im = load_image(image, 0, 0)
|
||||
# num = c_int(0)
|
||||
# pnum = pointer(num)
|
||||
# predict_image(net, im)
|
||||
# dets = get_network_boxes(net, im.w, im.h, thresh, hier_thresh, None, 0, pnum)
|
||||
# num = pnum[0]
|
||||
# if (nms): do_nms_obj(dets, num, meta.classes, nms);
|
||||
#
|
||||
# res = []
|
||||
# for j in range(num):
|
||||
# for i in range(meta.classes):
|
||||
# if dets[j].prob[i] > 0:
|
||||
# b = dets[j].bbox
|
||||
# res.append((meta.names[i], dets[j].prob[i], (b.x, b.y, b.w, b.h)))
|
||||
# res = sorted(res, key=lambda x: -x[1])
|
||||
# free_image(im)
|
||||
# free_detections(dets, num)
|
||||
# return res
|
||||
#
|
||||
|
||||
# def main():
|
||||
# import os
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
# get exe file path, need to copy dynamic library to the same directory before.
|
||||
# absfilepath = os.path.dirname(__file__)
|
||||
# dylib_path = os.path.join(absfilepath, "libdarknet.so")
|
||||
#
|
||||
lib = CDLL("libdarknet.so", RTLD_GLOBAL)
|
||||
lib._Z13network_widthP7network.argtypes = [c_void_p]
|
||||
lib._Z13network_widthP7network.restype = c_int
|
||||
lib._Z13network_widthP7network.argtypes = [c_void_p]
|
||||
lib._Z13network_widthP7network.restype = c_int
|
||||
|
||||
# predict = lib.network_predict
|
||||
predict = lib._Z26network_predict_data_multiP7network4datai
|
||||
predict.argtypes = [c_void_p, POINTER(c_float)]
|
||||
predict.restype = POINTER(c_float)
|
||||
print("predict ", predict)
|
||||
|
||||
# # net = load_net("cfg/densenet201.cfg", "/home/pjreddie/trained/densenet201.weights", 0)
|
||||
# # im = load_image("data/wolf.jpg", 0, 0)
|
||||
# # meta = load_meta("cfg/imagenet1k.data")
|
||||
# # r = classify(net, meta, im)
|
||||
# # print r[:10]
|
||||
# net = load_net("cfg/tiny-yolo.cfg", "tiny-yolo.weights", 0)
|
||||
# meta = load_meta("cfg/coco.data")
|
||||
# r = detect(net, meta, "data/dog.jpg")
|
||||
# print(r)
|
|
@ -1,37 +0,0 @@
|
|||
from darknet import *
|
||||
|
||||
def predict_tactic(net, s):
|
||||
prob = 0
|
||||
d = c_array(c_float, [0.0]*256)
|
||||
tac = ''
|
||||
if not len(s):
|
||||
s = '\n'
|
||||
for c in s[:-1]:
|
||||
d[ord(c)] = 1
|
||||
pred = predict(net, d)
|
||||
d[ord(c)] = 0
|
||||
c = s[-1]
|
||||
while 1:
|
||||
d[ord(c)] = 1
|
||||
pred = predict(net, d)
|
||||
d[ord(c)] = 0
|
||||
pred = [pred[i] for i in range(256)]
|
||||
ind = sample(pred)
|
||||
c = chr(ind)
|
||||
prob += math.log(pred[ind])
|
||||
if len(tac) and tac[-1] == '.':
|
||||
break
|
||||
tac = tac + c
|
||||
return (tac, prob)
|
||||
|
||||
def predict_tactics(net, s, n):
|
||||
tacs = []
|
||||
for i in range(n):
|
||||
reset_rnn(net)
|
||||
tacs.append(predict_tactic(net, s))
|
||||
tacs = sorted(tacs, key=lambda x: -x[1])
|
||||
return tacs
|
||||
|
||||
net = load_net("cfg/coq.test.cfg", "/home/pjreddie/backup/coq.backup", 0)
|
||||
t = predict_tactics(net, "+++++\n", 10)
|
||||
print t
|
|
@ -4,28 +4,30 @@ import os
|
|||
from os import listdir, getcwd
|
||||
from os.path import join
|
||||
|
||||
sets=[('2012', 'train'), ('2012', 'val'), ('2007', 'train'), ('2007', 'val'), ('2007', 'test')]
|
||||
sets = [('2012', 'train'), ('2012', 'val'), ('2007', 'train'), ('2007', 'val'), ('2007', 'test')]
|
||||
|
||||
classes = ["aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat", "chair", "cow", "diningtable", "dog", "horse", "motorbike", "person", "pottedplant", "sheep", "sofa", "train", "tvmonitor"]
|
||||
classes = ["aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat", "chair", "cow", "diningtable", "dog", "horse", "motorbike", "person", "pottedplant", "sheep",
|
||||
"sofa", "train", "tvmonitor"]
|
||||
|
||||
|
||||
def convert(size, box):
|
||||
dw = 1./(size[0])
|
||||
dh = 1./(size[1])
|
||||
x = (box[0] + box[1])/2.0 - 1
|
||||
y = (box[2] + box[3])/2.0 - 1
|
||||
dw = 1. / (size[0])
|
||||
dh = 1. / (size[1])
|
||||
x = (box[0] + box[1]) / 2.0 - 1
|
||||
y = (box[2] + box[3]) / 2.0 - 1
|
||||
w = box[1] - box[0]
|
||||
h = box[3] - box[2]
|
||||
x = x*dw
|
||||
w = w*dw
|
||||
y = y*dh
|
||||
h = h*dh
|
||||
return (x,y,w,h)
|
||||
x = x * dw
|
||||
w = w * dw
|
||||
y = y * dh
|
||||
h = h * dh
|
||||
return (x, y, w, h)
|
||||
|
||||
|
||||
def convert_annotation(year, image_id):
|
||||
in_file = open('VOCdevkit/VOC%s/Annotations/%s.xml'%(year, image_id))
|
||||
out_file = open('VOCdevkit/VOC%s/labels/%s.txt'%(year, image_id), 'w')
|
||||
tree=ET.parse(in_file)
|
||||
in_file = open('VOCdevkit/VOC%s/Annotations/%s.xml' % (year, image_id))
|
||||
out_file = open('VOCdevkit/VOC%s/labels/%s.txt' % (year, image_id), 'w')
|
||||
tree = ET.parse(in_file)
|
||||
root = tree.getroot()
|
||||
size = root.find('size')
|
||||
w = int(size.find('width').text)
|
||||
|
@ -34,26 +36,26 @@ def convert_annotation(year, image_id):
|
|||
for obj in root.iter('object'):
|
||||
difficult = obj.find('difficult').text
|
||||
cls = obj.find('name').text
|
||||
if cls not in classes or int(difficult)==1:
|
||||
if cls not in classes or int(difficult) == 1:
|
||||
continue
|
||||
cls_id = classes.index(cls)
|
||||
xmlbox = obj.find('bndbox')
|
||||
b = (float(xmlbox.find('xmin').text), float(xmlbox.find('xmax').text), float(xmlbox.find('ymin').text), float(xmlbox.find('ymax').text))
|
||||
bb = convert((w,h), b)
|
||||
bb = convert((w, h), b)
|
||||
out_file.write(str(cls_id) + " " + " ".join([str(a) for a in bb]) + '\n')
|
||||
|
||||
|
||||
wd = getcwd()
|
||||
|
||||
for year, image_set in sets:
|
||||
if not os.path.exists('VOCdevkit/VOC%s/labels/'%(year)):
|
||||
os.makedirs('VOCdevkit/VOC%s/labels/'%(year))
|
||||
image_ids = open('VOCdevkit/VOC%s/ImageSets/Main/%s.txt'%(year, image_set)).read().strip().split()
|
||||
list_file = open('%s_%s.txt'%(year, image_set), 'w')
|
||||
if not os.path.exists('VOCdevkit/VOC%s/labels/' % (year)):
|
||||
os.makedirs('VOCdevkit/VOC%s/labels/' % (year))
|
||||
image_ids = open('VOCdevkit/VOC%s/ImageSets/Main/%s.txt' % (year, image_set)).read().strip().split()
|
||||
list_file = open('%s_%s.txt' % (year, image_set), 'w')
|
||||
for image_id in image_ids:
|
||||
list_file.write('%s/VOCdevkit/VOC%s/JPEGImages/%s.jpg\n'%(wd, year, image_id))
|
||||
list_file.write('%s/VOCdevkit/VOC%s/JPEGImages/%s.jpg\n' % (wd, year, image_id))
|
||||
convert_annotation(year, image_id)
|
||||
list_file.close()
|
||||
|
||||
os.system("cat 2007_train.txt 2007_val.txt 2012_train.txt 2012_val.txt > train.txt")
|
||||
os.system("cat 2007_train.txt 2007_val.txt 2007_test.txt 2012_train.txt 2012_val.txt > train.all.txt")
|
||||
|
||||
|
|
|
@ -0,0 +1,61 @@
|
|||
import xml.etree.ElementTree as ET
|
||||
import pickle
|
||||
import os
|
||||
from os import listdir, getcwd
|
||||
from os.path import join
|
||||
|
||||
sets = [('2012', 'train'), ('2012', 'val'), ('2007', 'train'), ('2007', 'val'), ('2007', 'test')]
|
||||
|
||||
classes = ["aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat", "chair", "cow", "diningtable", "dog", "horse", "motorbike", "person", "pottedplant", "sheep",
|
||||
"sofa", "train", "tvmonitor"]
|
||||
|
||||
|
||||
def convert(size, box):
|
||||
dw = 1. / (size[0])
|
||||
dh = 1. / (size[1])
|
||||
x = (box[0] + box[1]) / 2.0 - 1
|
||||
y = (box[2] + box[3]) / 2.0 - 1
|
||||
w = box[1] - box[0]
|
||||
h = box[3] - box[2]
|
||||
x = x * dw
|
||||
w = w * dw
|
||||
y = y * dh
|
||||
h = h * dh
|
||||
return (x, y, w, h)
|
||||
|
||||
|
||||
def convert_annotation(year, image_id):
|
||||
in_file = open('VOCdevkit/VOC%s/Annotations/%s.xml' % (year, image_id))
|
||||
out_file = open('VOCdevkit/VOC%s/labels/%s.txt' % (year, image_id), 'w')
|
||||
tree = ET.parse(in_file)
|
||||
root = tree.getroot()
|
||||
size = root.find('size')
|
||||
w = int(size.find('width').text)
|
||||
h = int(size.find('height').text)
|
||||
|
||||
for obj in root.iter('object'):
|
||||
difficult = obj.find('difficult').text
|
||||
cls = obj.find('name').text
|
||||
if cls not in classes or int(difficult) == 1:
|
||||
continue
|
||||
cls_id = classes.index(cls)
|
||||
xmlbox = obj.find('bndbox')
|
||||
b = (float(xmlbox.find('xmin').text), float(xmlbox.find('xmax').text), float(xmlbox.find('ymin').text), float(xmlbox.find('ymax').text))
|
||||
bb = convert((w, h), b)
|
||||
out_file.write(str(cls_id) + " " + " ".join([str(a) for a in bb]) + '\n')
|
||||
|
||||
|
||||
wd = getcwd()
|
||||
|
||||
for year, image_set in sets:
|
||||
if not os.path.exists('VOCdevkit/VOC%s/labels/' % (year)):
|
||||
os.makedirs('VOCdevkit/VOC%s/labels/' % (year))
|
||||
image_ids = open('VOCdevkit/VOC%s/ImageSets/Main/%s.txt' % (year, image_set)).read().strip().split()
|
||||
list_file = open('%s_%s.txt' % (year, image_set), 'w')
|
||||
for image_id in image_ids:
|
||||
list_file.write('%s/VOCdevkit/VOC%s/JPEGImages/%s.jpg\n' % (wd, year, image_id))
|
||||
convert_annotation(year, image_id)
|
||||
list_file.close()
|
||||
|
||||
os.system("cat 2007_train.txt 2007_val.txt 2012_train.txt 2012_val.txt > train.txt")
|
||||
os.system("cat 2007_train.txt 2007_val.txt 2007_test.txt 2012_train.txt 2012_val.txt > train.all.txt")
|
|
@ -1,84 +1,98 @@
|
|||
#include "cuda_runtime.h"
|
||||
#include "curand.h"
|
||||
#include "cublas_v2.h"
|
||||
#include "hip/hip_runtime.h"
|
||||
#include "hiprand.h"
|
||||
#include "hipblas.h"
|
||||
|
||||
extern "C" {
|
||||
#include "activations.h"
|
||||
#include "cuda.h"
|
||||
}
|
||||
|
||||
|
||||
__device__ float lhtan_activate_kernel(float x)
|
||||
{
|
||||
if(x < 0) return .001f*x;
|
||||
if(x > 1) return .001f*(x-1.f) + 1.f;
|
||||
__device__ float lhtan_activate_kernel(float x) {
|
||||
if (x < 0) return .001f * x;
|
||||
if (x > 1) return .001f * (x - 1.f) + 1.f;
|
||||
return x;
|
||||
}
|
||||
__device__ float lhtan_gradient_kernel(float x)
|
||||
{
|
||||
if(x > 0 && x < 1) return 1;
|
||||
|
||||
__device__ float lhtan_gradient_kernel(float x) {
|
||||
if (x > 0 && x < 1) return 1;
|
||||
return .001;
|
||||
}
|
||||
|
||||
__device__ float hardtan_activate_kernel(float x)
|
||||
{
|
||||
__device__ float hardtan_activate_kernel(float x) {
|
||||
if (x < -1) return -1;
|
||||
if (x > 1) return 1;
|
||||
return x;
|
||||
}
|
||||
__device__ float linear_activate_kernel(float x){return x;}
|
||||
__device__ float logistic_activate_kernel(float x){return 1.f/(1.f + expf(-x));}
|
||||
__device__ float loggy_activate_kernel(float x){return 2.f/(1.f + expf(-x)) - 1;}
|
||||
__device__ float relu_activate_kernel(float x){return x*(x>0);}
|
||||
__device__ float elu_activate_kernel(float x){return (x >= 0)*x + (x < 0)*(expf(x)-1);}
|
||||
__device__ float selu_activate_kernel(float x){return (x >= 0)*1.0507f*x + (x < 0)*1.0507f*1.6732f*(expf(x)-1);}
|
||||
__device__ float relie_activate_kernel(float x){return (x>0) ? x : .01f*x;}
|
||||
__device__ float ramp_activate_kernel(float x){return x*(x>0)+.1f*x;}
|
||||
__device__ float leaky_activate_kernel(float x){return (x>0) ? x : .1f*x;}
|
||||
__device__ float tanh_activate_kernel(float x){return (2.f/(1 + expf(-2*x)) - 1);}
|
||||
__device__ float plse_activate_kernel(float x)
|
||||
{
|
||||
if(x < -4) return .01f * (x + 4);
|
||||
if(x > 4) return .01f * (x - 4) + 1;
|
||||
return .125f*x + .5f;
|
||||
}
|
||||
__device__ float stair_activate_kernel(float x)
|
||||
{
|
||||
int n = floorf(x);
|
||||
if (n%2 == 0) return floorf(x/2);
|
||||
else return (x - n) + floorf(x/2);
|
||||
}
|
||||
|
||||
|
||||
__device__ float hardtan_gradient_kernel(float x)
|
||||
{
|
||||
__device__ float linear_activate_kernel(float x) { return x; }
|
||||
|
||||
__device__ float logistic_activate_kernel(float x) { return 1.f / (1.f + expf(-x)); }
|
||||
|
||||
__device__ float loggy_activate_kernel(float x) { return 2.f / (1.f + expf(-x)) - 1; }
|
||||
|
||||
__device__ float relu_activate_kernel(float x) { return x * (x > 0); }
|
||||
|
||||
__device__ float elu_activate_kernel(float x) { return (x >= 0) * x + (x < 0) * (expf(x) - 1); }
|
||||
|
||||
__device__ float selu_activate_kernel(float x) {
|
||||
return (x >= 0) * 1.0507f * x + (x < 0) * 1.0507f * 1.6732f * (expf(x) - 1);
|
||||
}
|
||||
|
||||
__device__ float relie_activate_kernel(float x) { return (x > 0) ? x : .01f * x; }
|
||||
|
||||
__device__ float ramp_activate_kernel(float x) { return x * (x > 0) + .1f * x; }
|
||||
|
||||
__device__ float leaky_activate_kernel(float x) { return (x > 0) ? x : .1f * x; }
|
||||
|
||||
__device__ float tanh_activate_kernel(float x) { return (2.f / (1 + expf(-2 * x)) - 1); }
|
||||
|
||||
__device__ float plse_activate_kernel(float x) {
|
||||
if (x < -4) return .01f * (x + 4);
|
||||
if (x > 4) return .01f * (x - 4) + 1;
|
||||
return .125f * x + .5f;
|
||||
}
|
||||
|
||||
__device__ float stair_activate_kernel(float x) {
|
||||
int n = floorf(x);
|
||||
if (n % 2 == 0) return floorf(x / 2);
|
||||
else return (x - n) + floorf(x / 2);
|
||||
}
|
||||
|
||||
|
||||
__device__ float hardtan_gradient_kernel(float x) {
|
||||
if (x > -1 && x < 1) return 1;
|
||||
return 0;
|
||||
}
|
||||
__device__ float linear_gradient_kernel(float x){return 1;}
|
||||
__device__ float logistic_gradient_kernel(float x){return (1-x)*x;}
|
||||
__device__ float loggy_gradient_kernel(float x)
|
||||
{
|
||||
float y = (x+1)/2;
|
||||
return 2*(1-y)*y;
|
||||
|
||||
__device__ float linear_gradient_kernel(float x) { return 1; }
|
||||
|
||||
__device__ float logistic_gradient_kernel(float x) { return (1 - x) * x; }
|
||||
|
||||
__device__ float loggy_gradient_kernel(float x) {
|
||||
float y = (x + 1) / 2;
|
||||
return 2 * (1 - y) * y;
|
||||
}
|
||||
__device__ float relu_gradient_kernel(float x){return (x>0);}
|
||||
__device__ float elu_gradient_kernel(float x){return (x >= 0) + (x < 0)*(x + 1);}
|
||||
__device__ float selu_gradient_kernel(float x){return (x >= 0)*1.0507 + (x < 0)*(x + 1.0507*1.6732);}
|
||||
__device__ float relie_gradient_kernel(float x){return (x>0) ? 1 : .01f;}
|
||||
__device__ float ramp_gradient_kernel(float x){return (x>0)+.1f;}
|
||||
__device__ float leaky_gradient_kernel(float x){return (x>0) ? 1 : .1f;}
|
||||
__device__ float tanh_gradient_kernel(float x){return 1-x*x;}
|
||||
__device__ float plse_gradient_kernel(float x){return (x < 0 || x > 1) ? .01f : .125f;}
|
||||
__device__ float stair_gradient_kernel(float x)
|
||||
{
|
||||
|
||||
__device__ float relu_gradient_kernel(float x) { return (x > 0); }
|
||||
|
||||
__device__ float elu_gradient_kernel(float x) { return (x >= 0) + (x < 0) * (x + 1); }
|
||||
|
||||
__device__ float selu_gradient_kernel(float x) { return (x >= 0) * 1.0507 + (x < 0) * (x + 1.0507 * 1.6732); }
|
||||
|
||||
__device__ float relie_gradient_kernel(float x) { return (x > 0) ? 1 : .01f; }
|
||||
|
||||
__device__ float ramp_gradient_kernel(float x) { return (x > 0) + .1f; }
|
||||
|
||||
__device__ float leaky_gradient_kernel(float x) { return (x > 0) ? 1 : .1f; }
|
||||
|
||||
__device__ float tanh_gradient_kernel(float x) { return 1 - x * x; }
|
||||
|
||||
__device__ float plse_gradient_kernel(float x) { return (x < 0 || x > 1) ? .01f : .125f; }
|
||||
|
||||
__device__ float stair_gradient_kernel(float x) {
|
||||
if (floorf(x) == x) return 0;
|
||||
return 1;
|
||||
}
|
||||
|
||||
__device__ float activate_kernel(float x, ACTIVATION a)
|
||||
{
|
||||
switch(a){
|
||||
__device__ float activate_kernel(float x, ACTIVATION a) {
|
||||
switch (a) {
|
||||
case LINEAR:
|
||||
return linear_activate_kernel(x);
|
||||
case LOGISTIC:
|
||||
|
@ -111,9 +125,8 @@ __device__ float activate_kernel(float x, ACTIVATION a)
|
|||
return 0;
|
||||
}
|
||||
|
||||
__device__ float gradient_kernel(float x, ACTIVATION a)
|
||||
{
|
||||
switch(a){
|
||||
__device__ float gradient_kernel(float x, ACTIVATION a) {
|
||||
switch (a) {
|
||||
case LINEAR:
|
||||
return linear_gradient_kernel(x);
|
||||
case LOGISTIC:
|
||||
|
@ -146,61 +159,54 @@ __device__ float gradient_kernel(float x, ACTIVATION a)
|
|||
return 0;
|
||||
}
|
||||
|
||||
__global__ void binary_gradient_array_kernel(float *x, float *dy, int n, int s, BINARY_ACTIVATION a, float *dx)
|
||||
{
|
||||
int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
|
||||
__global__ void binary_gradient_array_kernel(float *x, float *dy, int n, int s, BINARY_ACTIVATION a, float *dx) {
|
||||
int id = (blockIdx.x + blockIdx.y * gridDim.x) * blockDim.x + threadIdx.x;
|
||||
int i = id % s;
|
||||
int b = id / s;
|
||||
float x1 = x[b*s + i];
|
||||
float x2 = x[b*s + s/2 + i];
|
||||
if(id < n) {
|
||||
float x1 = x[b * s + i];
|
||||
float x2 = x[b * s + s / 2 + i];
|
||||
if (id < n) {
|
||||
float de = dy[id];
|
||||
dx[b*s + i] = x2*de;
|
||||
dx[b*s + s/2 + i] = x1*de;
|
||||
dx[b * s + i] = x2 * de;
|
||||
dx[b * s + s / 2 + i] = x1 * de;
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" void binary_gradient_array_gpu(float *x, float *dx, int n, int size, BINARY_ACTIVATION a, float *y)
|
||||
{
|
||||
binary_gradient_array_kernel<<<cuda_gridsize(n/2), BLOCK>>>(x, dx, n/2, size, a, y);
|
||||
check_error(cudaPeekAtLastError());
|
||||
void binary_gradient_array_gpu(float *x, float *dx, int n, int size, BINARY_ACTIVATION a, float *y) {
|
||||
binary_gradient_array_kernel<<<cuda_gridsize(n / 2), BLOCK>>>(x, dx, n / 2, size, a, y);
|
||||
check_error(hipPeekAtLastError());
|
||||
}
|
||||
__global__ void binary_activate_array_kernel(float *x, int n, int s, BINARY_ACTIVATION a, float *y)
|
||||
{
|
||||
int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
|
||||
|
||||
__global__ void binary_activate_array_kernel(float *x, int n, int s, BINARY_ACTIVATION a, float *y) {
|
||||
int id = (blockIdx.x + blockIdx.y * gridDim.x) * blockDim.x + threadIdx.x;
|
||||
int i = id % s;
|
||||
int b = id / s;
|
||||
float x1 = x[b*s + i];
|
||||
float x2 = x[b*s + s/2 + i];
|
||||
if(id < n) y[id] = x1*x2;
|
||||
float x1 = x[b * s + i];
|
||||
float x2 = x[b * s + s / 2 + i];
|
||||
if (id < n) y[id] = x1 * x2;
|
||||
}
|
||||
|
||||
extern "C" void binary_activate_array_gpu(float *x, int n, int size, BINARY_ACTIVATION a, float *y)
|
||||
{
|
||||
binary_activate_array_kernel<<<cuda_gridsize(n/2), BLOCK>>>(x, n/2, size, a, y);
|
||||
check_error(cudaPeekAtLastError());
|
||||
void binary_activate_array_gpu(float *x, int n, int size, BINARY_ACTIVATION a, float *y) {
|
||||
binary_activate_array_kernel<<<cuda_gridsize(n / 2), BLOCK>>>(x, n / 2, size, a, y);
|
||||
check_error(hipPeekAtLastError());
|
||||
}
|
||||
|
||||
__global__ void activate_array_kernel(float *x, int n, ACTIVATION a)
|
||||
{
|
||||
int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
|
||||
if(i < n) x[i] = activate_kernel(x[i], a);
|
||||
__global__ void activate_array_kernel(float *x, int n, ACTIVATION a) {
|
||||
int i = (blockIdx.x + blockIdx.y * gridDim.x) * blockDim.x + threadIdx.x;
|
||||
if (i < n) x[i] = activate_kernel(x[i], a);
|
||||
}
|
||||
|
||||
__global__ void gradient_array_kernel(float *x, int n, ACTIVATION a, float *delta)
|
||||
{
|
||||
int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
|
||||
if(i < n) delta[i] *= gradient_kernel(x[i], a);
|
||||
__global__ void gradient_array_kernel(float *x, int n, ACTIVATION a, float *delta) {
|
||||
int i = (blockIdx.x + blockIdx.y * gridDim.x) * blockDim.x + threadIdx.x;
|
||||
if (i < n) delta[i] *= gradient_kernel(x[i], a);
|
||||
}
|
||||
|
||||
extern "C" void activate_array_gpu(float *x, int n, ACTIVATION a)
|
||||
{
|
||||
void activate_array_gpu(float *x, int n, ACTIVATION a) {
|
||||
activate_array_kernel<<<cuda_gridsize(n), BLOCK>>>(x, n, a);
|
||||
check_error(cudaPeekAtLastError());
|
||||
check_error(hipPeekAtLastError());
|
||||
}
|
||||
|
||||
extern "C" void gradient_array_gpu(float *x, int n, ACTIVATION a, float *delta)
|
||||
{
|
||||
void gradient_array_gpu(float *x, int n, ACTIVATION a, float *delta) {
|
||||
gradient_array_kernel<<<cuda_gridsize(n), BLOCK>>>(x, n, a, delta);
|
||||
check_error(cudaPeekAtLastError());
|
||||
check_error(hipPeekAtLastError());
|
||||
}
|
||||
|
|
|
@ -1,6 +1,5 @@
|
|||
#include "activation_layer.h"
|
||||
#include "utils.h"
|
||||
#include "cuda.h"
|
||||
#include "blas.h"
|
||||
#include "gemm.h"
|
||||
|
||||
|
@ -9,17 +8,20 @@
|
|||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
layer make_activation_layer(int batch, int inputs, ACTIVATION activation)
|
||||
{
|
||||
layer l = {0};
|
||||
#ifdef GPU
|
||||
#include "hip/hip_runtime.h"
|
||||
#endif
|
||||
|
||||
layer make_activation_layer(int batch, int inputs, ACTIVATION activation) {
|
||||
layer l = {(LAYER_TYPE)0};
|
||||
l.type = ACTIVE;
|
||||
|
||||
l.inputs = inputs;
|
||||
l.outputs = inputs;
|
||||
l.batch=batch;
|
||||
l.batch = batch;
|
||||
|
||||
l.output = calloc(batch*inputs, sizeof(float*));
|
||||
l.delta = calloc(batch*inputs, sizeof(float*));
|
||||
l.output = (float*) calloc(batch * inputs, sizeof(float *));
|
||||
l.delta = (float*) calloc(batch * inputs, sizeof(float *));
|
||||
|
||||
l.forward = forward_activation_layer;
|
||||
l.backward = backward_activation_layer;
|
||||
|
@ -35,16 +37,14 @@ layer make_activation_layer(int batch, int inputs, ACTIVATION activation)
|
|||
return l;
|
||||
}
|
||||
|
||||
void forward_activation_layer(layer l, network net)
|
||||
{
|
||||
copy_cpu(l.outputs*l.batch, net.input, 1, l.output, 1);
|
||||
activate_array(l.output, l.outputs*l.batch, l.activation);
|
||||
void forward_activation_layer(layer l, network net) {
|
||||
copy_cpu(l.outputs * l.batch, net.input, 1, l.output, 1);
|
||||
activate_array(l.output, l.outputs * l.batch, l.activation);
|
||||
}
|
||||
|
||||
void backward_activation_layer(layer l, network net)
|
||||
{
|
||||
gradient_array(l.output, l.outputs*l.batch, l.activation, l.delta);
|
||||
copy_cpu(l.outputs*l.batch, l.delta, 1, net.delta, 1);
|
||||
void backward_activation_layer(layer l, network net) {
|
||||
gradient_array(l.output, l.outputs * l.batch, l.activation, l.delta);
|
||||
copy_cpu(l.outputs * l.batch, l.delta, 1, net.delta, 1);
|
||||
}
|
||||
|
||||
#ifdef GPU
|
|
@ -8,6 +8,7 @@
|
|||
layer make_activation_layer(int batch, int inputs, ACTIVATION activation);
|
||||
|
||||
void forward_activation_layer(layer l, network net);
|
||||
|
||||
void backward_activation_layer(layer l, network net);
|
||||
|
||||
#ifdef GPU
|
||||
|
|
|
@ -5,9 +5,8 @@
|
|||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
char *get_activation_string(ACTIVATION a)
|
||||
{
|
||||
switch(a){
|
||||
char *get_activation_string(ACTIVATION a) {
|
||||
switch (a) {
|
||||
case LOGISTIC:
|
||||
return "logistic";
|
||||
case LOGGY:
|
||||
|
@ -42,29 +41,27 @@ char *get_activation_string(ACTIVATION a)
|
|||
return "relu";
|
||||
}
|
||||
|
||||
ACTIVATION get_activation(char *s)
|
||||
{
|
||||
if (strcmp(s, "logistic")==0) return LOGISTIC;
|
||||
if (strcmp(s, "loggy")==0) return LOGGY;
|
||||
if (strcmp(s, "relu")==0) return RELU;
|
||||
if (strcmp(s, "elu")==0) return ELU;
|
||||
if (strcmp(s, "selu")==0) return SELU;
|
||||
if (strcmp(s, "relie")==0) return RELIE;
|
||||
if (strcmp(s, "plse")==0) return PLSE;
|
||||
if (strcmp(s, "hardtan")==0) return HARDTAN;
|
||||
if (strcmp(s, "lhtan")==0) return LHTAN;
|
||||
if (strcmp(s, "linear")==0) return LINEAR;
|
||||
if (strcmp(s, "ramp")==0) return RAMP;
|
||||
if (strcmp(s, "leaky")==0) return LEAKY;
|
||||
if (strcmp(s, "tanh")==0) return TANH;
|
||||
if (strcmp(s, "stair")==0) return STAIR;
|
||||
ACTIVATION get_activation(char *s) {
|
||||
if (strcmp(s, "logistic") == 0) return LOGISTIC;
|
||||
if (strcmp(s, "loggy") == 0) return LOGGY;
|
||||
if (strcmp(s, "relu") == 0) return RELU;
|
||||
if (strcmp(s, "elu") == 0) return ELU;
|
||||
if (strcmp(s, "selu") == 0) return SELU;
|
||||
if (strcmp(s, "relie") == 0) return RELIE;
|
||||
if (strcmp(s, "plse") == 0) return PLSE;
|
||||
if (strcmp(s, "hardtan") == 0) return HARDTAN;
|
||||
if (strcmp(s, "lhtan") == 0) return LHTAN;
|
||||
if (strcmp(s, "linear") == 0) return LINEAR;
|
||||
if (strcmp(s, "ramp") == 0) return RAMP;
|
||||
if (strcmp(s, "leaky") == 0) return LEAKY;
|
||||
if (strcmp(s, "tanh") == 0) return TANH;
|
||||
if (strcmp(s, "stair") == 0) return STAIR;
|
||||
fprintf(stderr, "Couldn't find activation function %s, going with ReLU\n", s);
|
||||
return RELU;
|
||||
}
|
||||
|
||||
float activate(float x, ACTIVATION a)
|
||||
{
|
||||
switch(a){
|
||||
float activate(float x, ACTIVATION a) {
|
||||
switch (a) {
|
||||
case LINEAR:
|
||||
return linear_activate(x);
|
||||
case LOGISTIC:
|
||||
|
@ -97,17 +94,15 @@ float activate(float x, ACTIVATION a)
|
|||
return 0;
|
||||
}
|
||||
|
||||
void activate_array(float *x, const int n, const ACTIVATION a)
|
||||
{
|
||||
void activate_array(float *x, const int n, const ACTIVATION a) {
|
||||
int i;
|
||||
for(i = 0; i < n; ++i){
|
||||
for (i = 0; i < n; ++i) {
|
||||
x[i] = activate(x[i], a);
|
||||
}
|
||||
}
|
||||
|
||||
float gradient(float x, ACTIVATION a)
|
||||
{
|
||||
switch(a){
|
||||
float gradient(float x, ACTIVATION a) {
|
||||
switch (a) {
|
||||
case LINEAR:
|
||||
return linear_gradient(x);
|
||||
case LOGISTIC:
|
||||
|
@ -140,10 +135,9 @@ float gradient(float x, ACTIVATION a)
|
|||
return 0;
|
||||
}
|
||||
|
||||
void gradient_array(const float *x, const int n, const ACTIVATION a, float *delta)
|
||||
{
|
||||
void gradient_array(const float *x, const int n, const ACTIVATION a, float *delta) {
|
||||
int i;
|
||||
for(i = 0; i < n; ++i){
|
||||
for (i = 0; i < n; ++i) {
|
||||
delta[i] *= gradient(x[i], a);
|
||||
}
|
||||
}
|
|
@ -1,87 +1,111 @@
|
|||
#ifndef ACTIVATIONS_H
|
||||
#define ACTIVATIONS_H
|
||||
|
||||
#include "darknet.h"
|
||||
#include "cuda.h"
|
||||
#include "math.h"
|
||||
|
||||
ACTIVATION get_activation(char *s);
|
||||
|
||||
char *get_activation_string(ACTIVATION a);
|
||||
|
||||
float activate(float x, ACTIVATION a);
|
||||
|
||||
float gradient(float x, ACTIVATION a);
|
||||
|
||||
void gradient_array(const float *x, const int n, const ACTIVATION a, float *delta);
|
||||
|
||||
void activate_array(float *x, const int n, const ACTIVATION a);
|
||||
|
||||
#ifdef GPU
|
||||
#include "hip/hip_runtime.h"
|
||||
|
||||
void activate_array_gpu(float *x, int n, ACTIVATION a);
|
||||
void gradient_array_gpu(float *x, int n, ACTIVATION a, float *delta);
|
||||
#endif
|
||||
|
||||
static inline float stair_activate(float x)
|
||||
{
|
||||
static inline float stair_activate(float x) {
|
||||
int n = floor(x);
|
||||
if (n%2 == 0) return floor(x/2.);
|
||||
else return (x - n) + floor(x/2.);
|
||||
if (n % 2 == 0) return floor(x / 2.);
|
||||
else return (x - n) + floor(x / 2.);
|
||||
}
|
||||
static inline float hardtan_activate(float x)
|
||||
{
|
||||
|
||||
static inline float hardtan_activate(float x) {
|
||||
if (x < -1) return -1;
|
||||
if (x > 1) return 1;
|
||||
return x;
|
||||
}
|
||||
static inline float linear_activate(float x){return x;}
|
||||
static inline float logistic_activate(float x){return 1./(1. + exp(-x));}
|
||||
static inline float loggy_activate(float x){return 2./(1. + exp(-x)) - 1;}
|
||||
static inline float relu_activate(float x){return x*(x>0);}
|
||||
static inline float elu_activate(float x){return (x >= 0)*x + (x < 0)*(exp(x)-1);}
|
||||
static inline float selu_activate(float x){return (x >= 0)*1.0507*x + (x < 0)*1.0507*1.6732*(exp(x)-1);}
|
||||
static inline float relie_activate(float x){return (x>0) ? x : .01*x;}
|
||||
static inline float ramp_activate(float x){return x*(x>0)+.1*x;}
|
||||
static inline float leaky_activate(float x){return (x>0) ? x : .1*x;}
|
||||
static inline float tanh_activate(float x){return (exp(2*x)-1)/(exp(2*x)+1);}
|
||||
static inline float plse_activate(float x)
|
||||
{
|
||||
if(x < -4) return .01 * (x + 4);
|
||||
if(x > 4) return .01 * (x - 4) + 1;
|
||||
return .125*x + .5;
|
||||
|
||||
static inline float linear_activate(float x) { return x; }
|
||||
|
||||
static inline float logistic_activate(float x) { return 1. / (1. + exp(-x)); }
|
||||
|
||||
static inline float loggy_activate(float x) { return 2. / (1. + exp(-x)) - 1; }
|
||||
|
||||
static inline float relu_activate(float x) { return x * (x > 0); }
|
||||
|
||||
static inline float elu_activate(float x) { return (x >= 0) * x + (x < 0) * (exp(x) - 1); }
|
||||
|
||||
static inline float selu_activate(float x) { return (x >= 0) * 1.0507 * x + (x < 0) * 1.0507 * 1.6732 * (exp(x) - 1); }
|
||||
|
||||
static inline float relie_activate(float x) { return (x > 0) ? x : .01 * x; }
|
||||
|
||||
static inline float ramp_activate(float x) { return x * (x > 0) + .1 * x; }
|
||||
|
||||
static inline float leaky_activate(float x) { return (x > 0) ? x : .1 * x; }
|
||||
|
||||
static inline float tanh_activate(float x) { return (exp(2 * x) - 1) / (exp(2 * x) + 1); }
|
||||
|
||||
static inline float plse_activate(float x) {
|
||||
if (x < -4) return .01 * (x + 4);
|
||||
if (x > 4) return .01 * (x - 4) + 1;
|
||||
return .125 * x + .5;
|
||||
}
|
||||
|
||||
static inline float lhtan_activate(float x)
|
||||
{
|
||||
if(x < 0) return .001*x;
|
||||
if(x > 1) return .001*(x-1) + 1;
|
||||
static inline float lhtan_activate(float x) {
|
||||
if (x < 0) return .001 * x;
|
||||
if (x > 1) return .001 * (x - 1) + 1;
|
||||
return x;
|
||||
}
|
||||
static inline float lhtan_gradient(float x)
|
||||
{
|
||||
if(x > 0 && x < 1) return 1;
|
||||
|
||||
static inline float lhtan_gradient(float x) {
|
||||
if (x > 0 && x < 1) return 1;
|
||||
return .001;
|
||||
}
|
||||
|
||||
static inline float hardtan_gradient(float x)
|
||||
{
|
||||
static inline float hardtan_gradient(float x) {
|
||||
if (x > -1 && x < 1) return 1;
|
||||
return 0;
|
||||
}
|
||||
static inline float linear_gradient(float x){return 1;}
|
||||
static inline float logistic_gradient(float x){return (1-x)*x;}
|
||||
static inline float loggy_gradient(float x)
|
||||
{
|
||||
float y = (x+1.)/2.;
|
||||
return 2*(1-y)*y;
|
||||
|
||||
static inline float linear_gradient(float x) { return 1; }
|
||||
|
||||
static inline float logistic_gradient(float x) { return (1 - x) * x; }
|
||||
|
||||
static inline float loggy_gradient(float x) {
|
||||
float y = (x + 1.) / 2.;
|
||||
return 2 * (1 - y) * y;
|
||||
}
|
||||
static inline float stair_gradient(float x)
|
||||
{
|
||||
|
||||
static inline float stair_gradient(float x) {
|
||||
if (floor(x) == x) return 0;
|
||||
return 1;
|
||||
}
|
||||
static inline float relu_gradient(float x){return (x>0);}
|
||||
static inline float elu_gradient(float x){return (x >= 0) + (x < 0)*(x + 1);}
|
||||
static inline float selu_gradient(float x){return (x >= 0)*1.0507 + (x < 0)*(x + 1.0507*1.6732);}
|
||||
static inline float relie_gradient(float x){return (x>0) ? 1 : .01;}
|
||||
static inline float ramp_gradient(float x){return (x>0)+.1;}
|
||||
static inline float leaky_gradient(float x){return (x>0) ? 1 : .1;}
|
||||
static inline float tanh_gradient(float x){return 1-x*x;}
|
||||
static inline float plse_gradient(float x){return (x < 0 || x > 1) ? .01 : .125;}
|
||||
|
||||
static inline float relu_gradient(float x) { return (x > 0); }
|
||||
|
||||
static inline float elu_gradient(float x) { return (x >= 0) + (x < 0) * (x + 1); }
|
||||
|
||||
static inline float selu_gradient(float x) { return (x >= 0) * 1.0507 + (x < 0) * (x + 1.0507 * 1.6732); }
|
||||
|
||||
static inline float relie_gradient(float x) { return (x > 0) ? 1 : .01; }
|
||||
|
||||
static inline float ramp_gradient(float x) { return (x > 0) + .1; }
|
||||
|
||||
static inline float leaky_gradient(float x) { return (x > 0) ? 1 : .1; }
|
||||
|
||||
static inline float tanh_gradient(float x) { return 1 - x * x; }
|
||||
|
||||
static inline float plse_gradient(float x) { return (x < 0 || x > 1) ? .01 : .125; }
|
||||
|
||||
#endif
|
||||
|
||||
|
|
|
@ -1,71 +0,0 @@
|
|||
#include "avgpool_layer.h"
|
||||
#include "cuda.h"
|
||||
#include <stdio.h>
|
||||
|
||||
avgpool_layer make_avgpool_layer(int batch, int w, int h, int c)
|
||||
{
|
||||
fprintf(stderr, "avg %4d x%4d x%4d -> %4d\n", w, h, c, c);
|
||||
avgpool_layer l = {0};
|
||||
l.type = AVGPOOL;
|
||||
l.batch = batch;
|
||||
l.h = h;
|
||||
l.w = w;
|
||||
l.c = c;
|
||||
l.out_w = 1;
|
||||
l.out_h = 1;
|
||||
l.out_c = c;
|
||||
l.outputs = l.out_c;
|
||||
l.inputs = h*w*c;
|
||||
int output_size = l.outputs * batch;
|
||||
l.output = calloc(output_size, sizeof(float));
|
||||
l.delta = calloc(output_size, sizeof(float));
|
||||
l.forward = forward_avgpool_layer;
|
||||
l.backward = backward_avgpool_layer;
|
||||
#ifdef GPU
|
||||
l.forward_gpu = forward_avgpool_layer_gpu;
|
||||
l.backward_gpu = backward_avgpool_layer_gpu;
|
||||
l.output_gpu = cuda_make_array(l.output, output_size);
|
||||
l.delta_gpu = cuda_make_array(l.delta, output_size);
|
||||
#endif
|
||||
return l;
|
||||
}
|
||||
|
||||
void resize_avgpool_layer(avgpool_layer *l, int w, int h)
|
||||
{
|
||||
l->w = w;
|
||||
l->h = h;
|
||||
l->inputs = h*w*l->c;
|
||||
}
|
||||
|
||||
void forward_avgpool_layer(const avgpool_layer l, network net)
|
||||
{
|
||||
int b,i,k;
|
||||
|
||||
for(b = 0; b < l.batch; ++b){
|
||||
for(k = 0; k < l.c; ++k){
|
||||
int out_index = k + b*l.c;
|
||||
l.output[out_index] = 0;
|
||||
for(i = 0; i < l.h*l.w; ++i){
|
||||
int in_index = i + l.h*l.w*(k + b*l.c);
|
||||
l.output[out_index] += net.input[in_index];
|
||||
}
|
||||
l.output[out_index] /= l.h*l.w;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void backward_avgpool_layer(const avgpool_layer l, network net)
|
||||
{
|
||||
int b,i,k;
|
||||
|
||||
for(b = 0; b < l.batch; ++b){
|
||||
for(k = 0; k < l.c; ++k){
|
||||
int out_index = k + b*l.c;
|
||||
for(i = 0; i < l.h*l.w; ++i){
|
||||
int in_index = i + l.h*l.w*(k + b*l.c);
|
||||
net.delta[in_index] += l.delta[out_index] / (l.h*l.w);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,70 @@
|
|||
#include "avgpool_layer.h"
|
||||
#include <stdio.h>
|
||||
|
||||
#ifdef GPU
|
||||
#include "hip/hip_runtime.h"
|
||||
#endif
|
||||
|
||||
avgpool_layer make_avgpool_layer(int batch, int w, int h, int c) {
|
||||
fprintf(stderr, "avg %4d x%4d x%4d -> %4d\n", w, h, c, c);
|
||||
avgpool_layer l = {(LAYER_TYPE)0};
|
||||
l.type = AVGPOOL;
|
||||
l.batch = batch;
|
||||
l.h = h;
|
||||
l.w = w;
|
||||
l.c = c;
|
||||
l.out_w = 1;
|
||||
l.out_h = 1;
|
||||
l.out_c = c;
|
||||
l.outputs = l.out_c;
|
||||
l.inputs = h * w * c;
|
||||
int output_size = l.outputs * batch;
|
||||
l.output = (float*) calloc(output_size, sizeof(float));
|
||||
l.delta = (float*) calloc(output_size, sizeof(float));
|
||||
l.forward = forward_avgpool_layer;
|
||||
l.backward = backward_avgpool_layer;
|
||||
#ifdef GPU
|
||||
l.forward_gpu = forward_avgpool_layer_gpu;
|
||||
l.backward_gpu = backward_avgpool_layer_gpu;
|
||||
l.output_gpu = cuda_make_array(l.output, output_size);
|
||||
l.delta_gpu = cuda_make_array(l.delta, output_size);
|
||||
#endif
|
||||
return l;
|
||||
}
|
||||
|
||||
void resize_avgpool_layer(avgpool_layer *l, int w, int h) {
|
||||
l->w = w;
|
||||
l->h = h;
|
||||
l->inputs = h * w * l->c;
|
||||
}
|
||||
|
||||
void forward_avgpool_layer(const avgpool_layer l, network net) {
|
||||
int b, i, k;
|
||||
|
||||
for (b = 0; b < l.batch; ++b) {
|
||||
for (k = 0; k < l.c; ++k) {
|
||||
int out_index = k + b * l.c;
|
||||
l.output[out_index] = 0;
|
||||
for (i = 0; i < l.h * l.w; ++i) {
|
||||
int in_index = i + l.h * l.w * (k + b * l.c);
|
||||
l.output[out_index] += net.input[in_index];
|
||||
}
|
||||
l.output[out_index] /= l.h * l.w;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void backward_avgpool_layer(const avgpool_layer l, network net) {
|
||||
int b, i, k;
|
||||
|
||||
for (b = 0; b < l.batch; ++b) {
|
||||
for (k = 0; k < l.c; ++k) {
|
||||
int out_index = k + b * l.c;
|
||||
for (i = 0; i < l.h * l.w; ++i) {
|
||||
int in_index = i + l.h * l.w * (k + b * l.c);
|
||||
net.delta[in_index] += l.delta[out_index] / (l.h * l.w);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -2,7 +2,6 @@
|
|||
#define AVGPOOL_LAYER_H
|
||||
|
||||
#include "image.h"
|
||||
#include "cuda.h"
|
||||
#include "layer.h"
|
||||
#include "network.h"
|
||||
|
||||
|
@ -15,6 +14,7 @@ void forward_avgpool_layer(const avgpool_layer l, network net);
|
|||
void backward_avgpool_layer(const avgpool_layer l, network net);
|
||||
|
||||
#ifdef GPU
|
||||
#include "hip/hip_runtime.h"
|
||||
void forward_avgpool_layer_gpu(avgpool_layer l, network net);
|
||||
void backward_avgpool_layer_gpu(avgpool_layer l, network net);
|
||||
#endif
|
||||
|
|
|
@ -1,61 +1,59 @@
|
|||
#include "cuda_runtime.h"
|
||||
#include "curand.h"
|
||||
#include "cublas_v2.h"
|
||||
#include "hiprand.h"
|
||||
#include "hipblas.h"
|
||||
|
||||
extern "C" {
|
||||
#include "avgpool_layer.h"
|
||||
#include "cuda.h"
|
||||
}
|
||||
|
||||
__global__ void forward_avgpool_layer_kernel(int n, int w, int h, int c, float *input, float *output)
|
||||
{
|
||||
int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
|
||||
if(id >= n) return;
|
||||
#ifdef GPU
|
||||
#include "hip/hip_runtime.h"
|
||||
#endif
|
||||
|
||||
__global__ void forward_avgpool_layer_kernel(int n, int w, int h, int c, float *input, float *output) {
|
||||
int id = (blockIdx.x + blockIdx.y * gridDim.x) * blockDim.x + threadIdx.x;
|
||||
if (id >= n) return;
|
||||
|
||||
int k = id % c;
|
||||
id /= c;
|
||||
int b = id;
|
||||
|
||||
int i;
|
||||
int out_index = (k + c*b);
|
||||
int out_index = (k + c * b);
|
||||
output[out_index] = 0;
|
||||
for(i = 0; i < w*h; ++i){
|
||||
int in_index = i + h*w*(k + b*c);
|
||||
for (i = 0; i < w * h; ++i) {
|
||||
int in_index = i + h * w * (k + b * c);
|
||||
output[out_index] += input[in_index];
|
||||
}
|
||||
output[out_index] /= w*h;
|
||||
output[out_index] /= w * h;
|
||||
}
|
||||
|
||||
__global__ void backward_avgpool_layer_kernel(int n, int w, int h, int c, float *in_delta, float *out_delta)
|
||||
{
|
||||
int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
|
||||
if(id >= n) return;
|
||||
__global__ void backward_avgpool_layer_kernel(int n, int w, int h, int c, float *in_delta, float *out_delta) {
|
||||
int id = (blockIdx.x + blockIdx.y * gridDim.x) * blockDim.x + threadIdx.x;
|
||||
if (id >= n) return;
|
||||
|
||||
int k = id % c;
|
||||
id /= c;
|
||||
int b = id;
|
||||
|
||||
int i;
|
||||
int out_index = (k + c*b);
|
||||
for(i = 0; i < w*h; ++i){
|
||||
int in_index = i + h*w*(k + b*c);
|
||||
in_delta[in_index] += out_delta[out_index] / (w*h);
|
||||
int out_index = (k + c * b);
|
||||
for (i = 0; i < w * h; ++i) {
|
||||
int in_index = i + h * w * (k + b * c);
|
||||
in_delta[in_index] += out_delta[out_index] / (w * h);
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" void forward_avgpool_layer_gpu(avgpool_layer layer, network net)
|
||||
{
|
||||
size_t n = layer.c*layer.batch;
|
||||
void forward_avgpool_layer_gpu(avgpool_layer layer, network net) {
|
||||
size_t n = layer.c * layer.batch;
|
||||
|
||||
forward_avgpool_layer_kernel<<<cuda_gridsize(n), BLOCK>>>(n, layer.w, layer.h, layer.c, net.input_gpu, layer.output_gpu);
|
||||
check_error(cudaPeekAtLastError());
|
||||
forward_avgpool_layer_kernel<<<cuda_gridsize(
|
||||
n), BLOCK>>>(n, layer.w, layer.h, layer.c, net.input_gpu, layer.output_gpu);
|
||||
check_error(hipPeekAtLastError());
|
||||
}
|
||||
|
||||
extern "C" void backward_avgpool_layer_gpu(avgpool_layer layer, network net)
|
||||
{
|
||||
size_t n = layer.c*layer.batch;
|
||||
void backward_avgpool_layer_gpu(avgpool_layer layer, network net) {
|
||||
size_t n = layer.c * layer.batch;
|
||||
|
||||
backward_avgpool_layer_kernel<<<cuda_gridsize(n), BLOCK>>>(n, layer.w, layer.h, layer.c, net.delta_gpu, layer.delta_gpu);
|
||||
check_error(cudaPeekAtLastError());
|
||||
backward_avgpool_layer_kernel<<<cuda_gridsize(
|
||||
n), BLOCK>>>(n, layer.w, layer.h, layer.c, net.delta_gpu, layer.delta_gpu);
|
||||
check_error(hipPeekAtLastError());
|
||||
}
|
||||
|
||||
|
|
|
@ -3,34 +3,33 @@
|
|||
#include "blas.h"
|
||||
#include <stdio.h>
|
||||
|
||||
layer make_batchnorm_layer(int batch, int w, int h, int c)
|
||||
{
|
||||
fprintf(stderr, "Batch Normalization Layer: %d x %d x %d image\n", w,h,c);
|
||||
layer l = {0};
|
||||
layer make_batchnorm_layer(int batch, int w, int h, int c) {
|
||||
fprintf(stderr, "Batch Normalization Layer: %d x %d x %d image\n", w, h, c);
|
||||
layer l = {(LAYER_TYPE)0};
|
||||
l.type = BATCHNORM;
|
||||
l.batch = batch;
|
||||
l.h = l.out_h = h;
|
||||
l.w = l.out_w = w;
|
||||
l.c = l.out_c = c;
|
||||
l.output = calloc(h * w * c * batch, sizeof(float));
|
||||
l.delta = calloc(h * w * c * batch, sizeof(float));
|
||||
l.inputs = w*h*c;
|
||||
l.output = (float *) calloc(h * w * c * batch, sizeof(float));
|
||||
l.delta = (float *) calloc(h * w * c * batch, sizeof(float));
|
||||
l.inputs = w * h * c;
|
||||
l.outputs = l.inputs;
|
||||
|
||||
l.scales = calloc(c, sizeof(float));
|
||||
l.scale_updates = calloc(c, sizeof(float));
|
||||
l.biases = calloc(c, sizeof(float));
|
||||
l.bias_updates = calloc(c, sizeof(float));
|
||||
l.scales = (float *) calloc(c, sizeof(float));
|
||||
l.scale_updates = (float *) calloc(c, sizeof(float));
|
||||
l.biases = (float *) calloc(c, sizeof(float));
|
||||
l.bias_updates = (float *) calloc(c, sizeof(float));
|
||||
int i;
|
||||
for(i = 0; i < c; ++i){
|
||||
for (i = 0; i < c; ++i) {
|
||||
l.scales[i] = 1;
|
||||
}
|
||||
|
||||
l.mean = calloc(c, sizeof(float));
|
||||
l.variance = calloc(c, sizeof(float));
|
||||
l.mean = (float *) calloc(c, sizeof(float));
|
||||
l.variance = (float *) calloc(c, sizeof(float));
|
||||
|
||||
l.rolling_mean = calloc(c, sizeof(float));
|
||||
l.rolling_variance = calloc(c, sizeof(float));
|
||||
l.rolling_mean = (float *) calloc(c, sizeof(float));
|
||||
l.rolling_variance = (float *) calloc(c, sizeof(float));
|
||||
|
||||
l.forward = forward_batchnorm_layer;
|
||||
l.backward = backward_batchnorm_layer;
|
||||
|
@ -58,25 +57,24 @@ layer make_batchnorm_layer(int batch, int w, int h, int c)
|
|||
|
||||
l.x_gpu = cuda_make_array(l.output, l.batch*l.outputs);
|
||||
l.x_norm_gpu = cuda_make_array(l.output, l.batch*l.outputs);
|
||||
#ifdef CUDNN
|
||||
cudnnCreateTensorDescriptor(&l.normTensorDesc);
|
||||
cudnnCreateTensorDescriptor(&l.dstTensorDesc);
|
||||
cudnnSetTensor4dDescriptor(l.dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l.batch, l.out_c, l.out_h, l.out_w);
|
||||
cudnnSetTensor4dDescriptor(l.normTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, l.out_c, 1, 1);
|
||||
#ifdef CUDNN
|
||||
hipdnnCreateTensorDescriptor(&l.normTensorDesc);
|
||||
hipdnnCreateTensorDescriptor(&l.dstTensorDesc);
|
||||
hipdnnSetTensor4dDescriptor(l.dstTensorDesc, HIPDNN_TENSOR_NCHW, HIPDNN_DATA_FLOAT, l.batch, l.out_c, l.out_h, l.out_w);
|
||||
hipdnnSetTensor4dDescriptor(l.normTensorDesc, HIPDNN_TENSOR_NCHW, HIPDNN_DATA_FLOAT, 1, l.out_c, 1, 1);
|
||||
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
return l;
|
||||
}
|
||||
|
||||
void backward_scale_cpu(float *x_norm, float *delta, int batch, int n, int size, float *scale_updates)
|
||||
{
|
||||
int i,b,f;
|
||||
for(f = 0; f < n; ++f){
|
||||
void backward_scale_cpu(float *x_norm, float *delta, int batch, int n, int size, float *scale_updates) {
|
||||
int i, b, f;
|
||||
for (f = 0; f < n; ++f) {
|
||||
float sum = 0;
|
||||
for(b = 0; b < batch; ++b){
|
||||
for(i = 0; i < size; ++i){
|
||||
int index = i + size*(f + n*b);
|
||||
for (b = 0; b < batch; ++b) {
|
||||
for (i = 0; i < size; ++i) {
|
||||
int index = i + size * (f + n * b);
|
||||
sum += delta[index] * x_norm[index];
|
||||
}
|
||||
}
|
||||
|
@ -84,91 +82,92 @@ void backward_scale_cpu(float *x_norm, float *delta, int batch, int n, int size,
|
|||
}
|
||||
}
|
||||
|
||||
void mean_delta_cpu(float *delta, float *variance, int batch, int filters, int spatial, float *mean_delta)
|
||||
{
|
||||
void mean_delta_cpu(float *delta, float *variance, int batch, int filters, int spatial, float *mean_delta) {
|
||||
|
||||
int i,j,k;
|
||||
for(i = 0; i < filters; ++i){
|
||||
int i, j, k;
|
||||
for (i = 0; i < filters; ++i) {
|
||||
mean_delta[i] = 0;
|
||||
for (j = 0; j < batch; ++j) {
|
||||
for (k = 0; k < spatial; ++k) {
|
||||
int index = j*filters*spatial + i*spatial + k;
|
||||
int index = j * filters * spatial + i * spatial + k;
|
||||
mean_delta[i] += delta[index];
|
||||
}
|
||||
}
|
||||
mean_delta[i] *= (-1./sqrt(variance[i] + .00001f));
|
||||
mean_delta[i] *= (-1. / sqrt(variance[i] + .00001f));
|
||||
}
|
||||
}
|
||||
void variance_delta_cpu(float *x, float *delta, float *mean, float *variance, int batch, int filters, int spatial, float *variance_delta)
|
||||
{
|
||||
|
||||
int i,j,k;
|
||||
for(i = 0; i < filters; ++i){
|
||||
void variance_delta_cpu(float *x, float *delta, float *mean, float *variance, int batch, int filters, int spatial,
|
||||
float *variance_delta) {
|
||||
|
||||
int i, j, k;
|
||||
for (i = 0; i < filters; ++i) {
|
||||
variance_delta[i] = 0;
|
||||
for(j = 0; j < batch; ++j){
|
||||
for(k = 0; k < spatial; ++k){
|
||||
int index = j*filters*spatial + i*spatial + k;
|
||||
variance_delta[i] += delta[index]*(x[index] - mean[i]);
|
||||
for (j = 0; j < batch; ++j) {
|
||||
for (k = 0; k < spatial; ++k) {
|
||||
int index = j * filters * spatial + i * spatial + k;
|
||||
variance_delta[i] += delta[index] * (x[index] - mean[i]);
|
||||
}
|
||||
}
|
||||
variance_delta[i] *= -.5 * pow(variance[i] + .00001f, (float)(-3./2.));
|
||||
variance_delta[i] *= -.5 * pow(variance[i] + .00001f, (float) (-3. / 2.));
|
||||
}
|
||||
}
|
||||
void normalize_delta_cpu(float *x, float *mean, float *variance, float *mean_delta, float *variance_delta, int batch, int filters, int spatial, float *delta)
|
||||
{
|
||||
|
||||
void normalize_delta_cpu(float *x, float *mean, float *variance, float *mean_delta, float *variance_delta, int batch,
|
||||
int filters, int spatial, float *delta) {
|
||||
int f, j, k;
|
||||
for(j = 0; j < batch; ++j){
|
||||
for(f = 0; f < filters; ++f){
|
||||
for(k = 0; k < spatial; ++k){
|
||||
int index = j*filters*spatial + f*spatial + k;
|
||||
delta[index] = delta[index] * 1./(sqrt(variance[f] + .00001f)) + variance_delta[f] * 2. * (x[index] - mean[f]) / (spatial * batch) + mean_delta[f]/(spatial*batch);
|
||||
for (j = 0; j < batch; ++j) {
|
||||
for (f = 0; f < filters; ++f) {
|
||||
for (k = 0; k < spatial; ++k) {
|
||||
int index = j * filters * spatial + f * spatial + k;
|
||||
delta[index] = delta[index] * 1. / (sqrt(variance[f] + .00001f)) +
|
||||
variance_delta[f] * 2. * (x[index] - mean[f]) / (spatial * batch) +
|
||||
mean_delta[f] / (spatial * batch);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void resize_batchnorm_layer(layer *layer, int w, int h)
|
||||
{
|
||||
void resize_batchnorm_layer(layer *layer, int w, int h) {
|
||||
fprintf(stderr, "Not implemented\n");
|
||||
}
|
||||
|
||||
void forward_batchnorm_layer(layer l, network net)
|
||||
{
|
||||
if(l.type == BATCHNORM) copy_cpu(l.outputs*l.batch, net.input, 1, l.output, 1);
|
||||
copy_cpu(l.outputs*l.batch, l.output, 1, l.x, 1);
|
||||
if(net.train){
|
||||
mean_cpu(l.output, l.batch, l.out_c, l.out_h*l.out_w, l.mean);
|
||||
variance_cpu(l.output, l.mean, l.batch, l.out_c, l.out_h*l.out_w, l.variance);
|
||||
void forward_batchnorm_layer(layer l, network net) {
|
||||
if (l.type == BATCHNORM) copy_cpu(l.outputs * l.batch, net.input, 1, l.output, 1);
|
||||
copy_cpu(l.outputs * l.batch, l.output, 1, l.x, 1);
|
||||
if (net.train) {
|
||||
mean_cpu(l.output, l.batch, l.out_c, l.out_h * l.out_w, l.mean);
|
||||
variance_cpu(l.output, l.mean, l.batch, l.out_c, l.out_h * l.out_w, l.variance);
|
||||
|
||||
scal_cpu(l.out_c, .99, l.rolling_mean, 1);
|
||||
axpy_cpu(l.out_c, .01, l.mean, 1, l.rolling_mean, 1);
|
||||
scal_cpu(l.out_c, .99, l.rolling_variance, 1);
|
||||
axpy_cpu(l.out_c, .01, l.variance, 1, l.rolling_variance, 1);
|
||||
|
||||
normalize_cpu(l.output, l.mean, l.variance, l.batch, l.out_c, l.out_h*l.out_w);
|
||||
copy_cpu(l.outputs*l.batch, l.output, 1, l.x_norm, 1);
|
||||
normalize_cpu(l.output, l.mean, l.variance, l.batch, l.out_c, l.out_h * l.out_w);
|
||||
copy_cpu(l.outputs * l.batch, l.output, 1, l.x_norm, 1);
|
||||
} else {
|
||||
normalize_cpu(l.output, l.rolling_mean, l.rolling_variance, l.batch, l.out_c, l.out_h*l.out_w);
|
||||
normalize_cpu(l.output, l.rolling_mean, l.rolling_variance, l.batch, l.out_c, l.out_h * l.out_w);
|
||||
}
|
||||
scale_bias(l.output, l.scales, l.batch, l.out_c, l.out_h*l.out_w);
|
||||
add_bias(l.output, l.biases, l.batch, l.out_c, l.out_h*l.out_w);
|
||||
scale_bias(l.output, l.scales, l.batch, l.out_c, l.out_h * l.out_w);
|
||||
add_bias(l.output, l.biases, l.batch, l.out_c, l.out_h * l.out_w);
|
||||
}
|
||||
|
||||
void backward_batchnorm_layer(layer l, network net)
|
||||
{
|
||||
if(!net.train){
|
||||
void backward_batchnorm_layer(layer l, network net) {
|
||||
if (!net.train) {
|
||||
l.mean = l.rolling_mean;
|
||||
l.variance = l.rolling_variance;
|
||||
}
|
||||
backward_bias(l.bias_updates, l.delta, l.batch, l.out_c, l.out_w*l.out_h);
|
||||
backward_scale_cpu(l.x_norm, l.delta, l.batch, l.out_c, l.out_w*l.out_h, l.scale_updates);
|
||||
backward_bias(l.bias_updates, l.delta, l.batch, l.out_c, l.out_w * l.out_h);
|
||||
backward_scale_cpu(l.x_norm, l.delta, l.batch, l.out_c, l.out_w * l.out_h, l.scale_updates);
|
||||
|
||||
scale_bias(l.delta, l.scales, l.batch, l.out_c, l.out_h*l.out_w);
|
||||
scale_bias(l.delta, l.scales, l.batch, l.out_c, l.out_h * l.out_w);
|
||||
|
||||
mean_delta_cpu(l.delta, l.variance, l.batch, l.out_c, l.out_w*l.out_h, l.mean_delta);
|
||||
variance_delta_cpu(l.x, l.delta, l.mean, l.variance, l.batch, l.out_c, l.out_w*l.out_h, l.variance_delta);
|
||||
normalize_delta_cpu(l.x, l.mean, l.variance, l.mean_delta, l.variance_delta, l.batch, l.out_c, l.out_w*l.out_h, l.delta);
|
||||
if(l.type == BATCHNORM) copy_cpu(l.outputs*l.batch, l.delta, 1, net.delta, 1);
|
||||
mean_delta_cpu(l.delta, l.variance, l.batch, l.out_c, l.out_w * l.out_h, l.mean_delta);
|
||||
variance_delta_cpu(l.x, l.delta, l.mean, l.variance, l.batch, l.out_c, l.out_w * l.out_h, l.variance_delta);
|
||||
normalize_delta_cpu(l.x, l.mean, l.variance, l.mean_delta, l.variance_delta, l.batch, l.out_c, l.out_w * l.out_h,
|
||||
l.delta);
|
||||
if (l.type == BATCHNORM) copy_cpu(l.outputs * l.batch, l.delta, 1, net.delta, 1);
|
||||
}
|
||||
|
||||
#ifdef GPU
|
||||
|
@ -194,8 +193,8 @@ void forward_batchnorm_layer_gpu(layer l, network net)
|
|||
#ifdef CUDNN
|
||||
float one = 1;
|
||||
float zero = 0;
|
||||
cudnnBatchNormalizationForwardTraining(cudnn_handle(),
|
||||
CUDNN_BATCHNORM_SPATIAL,
|
||||
hipdnnBatchNormalizationForwardTraining(cudnn_handle(),
|
||||
HIPDNN_BATCHNORM_SPATIAL,
|
||||
&one,
|
||||
&zero,
|
||||
l.dstTensorDesc,
|
||||
|
@ -244,8 +243,8 @@ void backward_batchnorm_layer_gpu(layer l, network net)
|
|||
#ifdef CUDNN
|
||||
float one = 1;
|
||||
float zero = 0;
|
||||
cudnnBatchNormalizationBackward(cudnn_handle(),
|
||||
CUDNN_BATCHNORM_SPATIAL,
|
||||
hipdnnBatchNormalizationBackward(cudnn_handle(),
|
||||
HIPDNN_BATCHNORM_SPATIAL,
|
||||
&one,
|
||||
&zero,
|
||||
&one,
|
351
src/blas.c
351
src/blas.c
|
@ -1,351 +0,0 @@
|
|||
#include "blas.h"
|
||||
|
||||
#include <math.h>
|
||||
#include <assert.h>
|
||||
#include <float.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
void reorg_cpu(float *x, int w, int h, int c, int batch, int stride, int forward, float *out)
|
||||
{
|
||||
int b,i,j,k;
|
||||
int out_c = c/(stride*stride);
|
||||
|
||||
for(b = 0; b < batch; ++b){
|
||||
for(k = 0; k < c; ++k){
|
||||
for(j = 0; j < h; ++j){
|
||||
for(i = 0; i < w; ++i){
|
||||
int in_index = i + w*(j + h*(k + c*b));
|
||||
int c2 = k % out_c;
|
||||
int offset = k / out_c;
|
||||
int w2 = i*stride + offset % stride;
|
||||
int h2 = j*stride + offset / stride;
|
||||
int out_index = w2 + w*stride*(h2 + h*stride*(c2 + out_c*b));
|
||||
if(forward) out[out_index] = x[in_index];
|
||||
else out[in_index] = x[out_index];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void flatten(float *x, int size, int layers, int batch, int forward)
|
||||
{
|
||||
float *swap = calloc(size*layers*batch, sizeof(float));
|
||||
int i,c,b;
|
||||
for(b = 0; b < batch; ++b){
|
||||
for(c = 0; c < layers; ++c){
|
||||
for(i = 0; i < size; ++i){
|
||||
int i1 = b*layers*size + c*size + i;
|
||||
int i2 = b*layers*size + i*layers + c;
|
||||
if (forward) swap[i2] = x[i1];
|
||||
else swap[i1] = x[i2];
|
||||
}
|
||||
}
|
||||
}
|
||||
memcpy(x, swap, size*layers*batch*sizeof(float));
|
||||
free(swap);
|
||||
}
|
||||
|
||||
void weighted_sum_cpu(float *a, float *b, float *s, int n, float *c)
|
||||
{
|
||||
int i;
|
||||
for(i = 0; i < n; ++i){
|
||||
c[i] = s[i]*a[i] + (1-s[i])*(b ? b[i] : 0);
|
||||
}
|
||||
}
|
||||
|
||||
void weighted_delta_cpu(float *a, float *b, float *s, float *da, float *db, float *ds, int n, float *dc)
|
||||
{
|
||||
int i;
|
||||
for(i = 0; i < n; ++i){
|
||||
if(da) da[i] += dc[i] * s[i];
|
||||
if(db) db[i] += dc[i] * (1-s[i]);
|
||||
ds[i] += dc[i] * (a[i] - b[i]);
|
||||
}
|
||||
}
|
||||
|
||||
void shortcut_cpu(int batch, int w1, int h1, int c1, float *add, int w2, int h2, int c2, float s1, float s2, float *out)
|
||||
{
|
||||
int stride = w1/w2;
|
||||
int sample = w2/w1;
|
||||
assert(stride == h1/h2);
|
||||
assert(sample == h2/h1);
|
||||
if(stride < 1) stride = 1;
|
||||
if(sample < 1) sample = 1;
|
||||
int minw = (w1 < w2) ? w1 : w2;
|
||||
int minh = (h1 < h2) ? h1 : h2;
|
||||
int minc = (c1 < c2) ? c1 : c2;
|
||||
|
||||
int i,j,k,b;
|
||||
for(b = 0; b < batch; ++b){
|
||||
for(k = 0; k < minc; ++k){
|
||||
for(j = 0; j < minh; ++j){
|
||||
for(i = 0; i < minw; ++i){
|
||||
int out_index = i*sample + w2*(j*sample + h2*(k + c2*b));
|
||||
int add_index = i*stride + w1*(j*stride + h1*(k + c1*b));
|
||||
out[out_index] = s1*out[out_index] + s2*add[add_index];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void mean_cpu(float *x, int batch, int filters, int spatial, float *mean)
|
||||
{
|
||||
float scale = 1./(batch * spatial);
|
||||
int i,j,k;
|
||||
for(i = 0; i < filters; ++i){
|
||||
mean[i] = 0;
|
||||
for(j = 0; j < batch; ++j){
|
||||
for(k = 0; k < spatial; ++k){
|
||||
int index = j*filters*spatial + i*spatial + k;
|
||||
mean[i] += x[index];
|
||||
}
|
||||
}
|
||||
mean[i] *= scale;
|
||||
}
|
||||
}
|
||||
|
||||
void variance_cpu(float *x, float *mean, int batch, int filters, int spatial, float *variance)
|
||||
{
|
||||
float scale = 1./(batch * spatial - 1);
|
||||
int i,j,k;
|
||||
for(i = 0; i < filters; ++i){
|
||||
variance[i] = 0;
|
||||
for(j = 0; j < batch; ++j){
|
||||
for(k = 0; k < spatial; ++k){
|
||||
int index = j*filters*spatial + i*spatial + k;
|
||||
variance[i] += pow((x[index] - mean[i]), 2);
|
||||
}
|
||||
}
|
||||
variance[i] *= scale;
|
||||
}
|
||||
}
|
||||
|
||||
void l2normalize_cpu(float *x, float *dx, int batch, int filters, int spatial)
|
||||
{
|
||||
int b,f,i;
|
||||
for(b = 0; b < batch; ++b){
|
||||
for(i = 0; i < spatial; ++i){
|
||||
float sum = 0;
|
||||
for(f = 0; f < filters; ++f){
|
||||
int index = b*filters*spatial + f*spatial + i;
|
||||
sum += powf(x[index], 2);
|
||||
}
|
||||
sum = sqrtf(sum);
|
||||
for(f = 0; f < filters; ++f){
|
||||
int index = b*filters*spatial + f*spatial + i;
|
||||
x[index] /= sum;
|
||||
dx[index] = (1 - x[index]) / sum;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void normalize_cpu(float *x, float *mean, float *variance, int batch, int filters, int spatial)
|
||||
{
|
||||
int b, f, i;
|
||||
for(b = 0; b < batch; ++b){
|
||||
for(f = 0; f < filters; ++f){
|
||||
for(i = 0; i < spatial; ++i){
|
||||
int index = b*filters*spatial + f*spatial + i;
|
||||
x[index] = (x[index] - mean[f])/(sqrt(variance[f]) + .000001f);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void const_cpu(int N, float ALPHA, float *X, int INCX)
|
||||
{
|
||||
int i;
|
||||
for(i = 0; i < N; ++i) X[i*INCX] = ALPHA;
|
||||
}
|
||||
|
||||
void mul_cpu(int N, float *X, int INCX, float *Y, int INCY)
|
||||
{
|
||||
int i;
|
||||
for(i = 0; i < N; ++i) Y[i*INCY] *= X[i*INCX];
|
||||
}
|
||||
|
||||
void pow_cpu(int N, float ALPHA, float *X, int INCX, float *Y, int INCY)
|
||||
{
|
||||
int i;
|
||||
for(i = 0; i < N; ++i) Y[i*INCY] = pow(X[i*INCX], ALPHA);
|
||||
}
|
||||
|
||||
void axpy_cpu(int N, float ALPHA, float *X, int INCX, float *Y, int INCY)
|
||||
{
|
||||
int i;
|
||||
for(i = 0; i < N; ++i) Y[i*INCY] += ALPHA*X[i*INCX];
|
||||
}
|
||||
|
||||
void scal_cpu(int N, float ALPHA, float *X, int INCX)
|
||||
{
|
||||
int i;
|
||||
for(i = 0; i < N; ++i) X[i*INCX] *= ALPHA;
|
||||
}
|
||||
|
||||
void fill_cpu(int N, float ALPHA, float *X, int INCX)
|
||||
{
|
||||
int i;
|
||||
for(i = 0; i < N; ++i) X[i*INCX] = ALPHA;
|
||||
}
|
||||
|
||||
void deinter_cpu(int NX, float *X, int NY, float *Y, int B, float *OUT)
|
||||
{
|
||||
int i, j;
|
||||
int index = 0;
|
||||
for(j = 0; j < B; ++j) {
|
||||
for(i = 0; i < NX; ++i){
|
||||
if(X) X[j*NX + i] += OUT[index];
|
||||
++index;
|
||||
}
|
||||
for(i = 0; i < NY; ++i){
|
||||
if(Y) Y[j*NY + i] += OUT[index];
|
||||
++index;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void inter_cpu(int NX, float *X, int NY, float *Y, int B, float *OUT)
|
||||
{
|
||||
int i, j;
|
||||
int index = 0;
|
||||
for(j = 0; j < B; ++j) {
|
||||
for(i = 0; i < NX; ++i){
|
||||
OUT[index++] = X[j*NX + i];
|
||||
}
|
||||
for(i = 0; i < NY; ++i){
|
||||
OUT[index++] = Y[j*NY + i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void copy_cpu(int N, float *X, int INCX, float *Y, int INCY)
|
||||
{
|
||||
int i;
|
||||
for(i = 0; i < N; ++i) Y[i*INCY] = X[i*INCX];
|
||||
}
|
||||
|
||||
void mult_add_into_cpu(int N, float *X, float *Y, float *Z)
|
||||
{
|
||||
int i;
|
||||
for(i = 0; i < N; ++i) Z[i] += X[i]*Y[i];
|
||||
}
|
||||
|
||||
void smooth_l1_cpu(int n, float *pred, float *truth, float *delta, float *error)
|
||||
{
|
||||
int i;
|
||||
for(i = 0; i < n; ++i){
|
||||
float diff = truth[i] - pred[i];
|
||||
float abs_val = fabs(diff);
|
||||
if(abs_val < 1) {
|
||||
error[i] = diff * diff;
|
||||
delta[i] = diff;
|
||||
}
|
||||
else {
|
||||
error[i] = 2*abs_val - 1;
|
||||
delta[i] = (diff < 0) ? 1 : -1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void l1_cpu(int n, float *pred, float *truth, float *delta, float *error)
|
||||
{
|
||||
int i;
|
||||
for(i = 0; i < n; ++i){
|
||||
float diff = truth[i] - pred[i];
|
||||
error[i] = fabs(diff);
|
||||
delta[i] = diff > 0 ? 1 : -1;
|
||||
}
|
||||
}
|
||||
|
||||
void softmax_x_ent_cpu(int n, float *pred, float *truth, float *delta, float *error)
|
||||
{
|
||||
int i;
|
||||
for(i = 0; i < n; ++i){
|
||||
float t = truth[i];
|
||||
float p = pred[i];
|
||||
error[i] = (t) ? -log(p) : 0;
|
||||
delta[i] = t-p;
|
||||
}
|
||||
}
|
||||
|
||||
void logistic_x_ent_cpu(int n, float *pred, float *truth, float *delta, float *error)
|
||||
{
|
||||
int i;
|
||||
for(i = 0; i < n; ++i){
|
||||
float t = truth[i];
|
||||
float p = pred[i];
|
||||
error[i] = -t*log(p) - (1-t)*log(1-p);
|
||||
delta[i] = t-p;
|
||||
}
|
||||
}
|
||||
|
||||
void l2_cpu(int n, float *pred, float *truth, float *delta, float *error)
|
||||
{
|
||||
int i;
|
||||
for(i = 0; i < n; ++i){
|
||||
float diff = truth[i] - pred[i];
|
||||
error[i] = diff * diff;
|
||||
delta[i] = diff;
|
||||
}
|
||||
}
|
||||
|
||||
float dot_cpu(int N, float *X, int INCX, float *Y, int INCY)
|
||||
{
|
||||
int i;
|
||||
float dot = 0;
|
||||
for(i = 0; i < N; ++i) dot += X[i*INCX] * Y[i*INCY];
|
||||
return dot;
|
||||
}
|
||||
|
||||
void softmax(float *input, int n, float temp, int stride, float *output)
|
||||
{
|
||||
int i;
|
||||
float sum = 0;
|
||||
float largest = -FLT_MAX;
|
||||
for(i = 0; i < n; ++i){
|
||||
if(input[i*stride] > largest) largest = input[i*stride];
|
||||
}
|
||||
for(i = 0; i < n; ++i){
|
||||
float e = exp(input[i*stride]/temp - largest/temp);
|
||||
sum += e;
|
||||
output[i*stride] = e;
|
||||
}
|
||||
for(i = 0; i < n; ++i){
|
||||
output[i*stride] /= sum;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void softmax_cpu(float *input, int n, int batch, int batch_offset, int groups, int group_offset, int stride, float temp, float *output)
|
||||
{
|
||||
int g, b;
|
||||
for(b = 0; b < batch; ++b){
|
||||
for(g = 0; g < groups; ++g){
|
||||
softmax(input + b*batch_offset + g*group_offset, n, temp, stride, output + b*batch_offset + g*group_offset);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void upsample_cpu(float *in, int w, int h, int c, int batch, int stride, int forward, float scale, float *out)
|
||||
{
|
||||
int i, j, k, b;
|
||||
for(b = 0; b < batch; ++b){
|
||||
for(k = 0; k < c; ++k){
|
||||
for(j = 0; j < h*stride; ++j){
|
||||
for(i = 0; i < w*stride; ++i){
|
||||
int in_index = b*w*h*c + k*w*h + (j/stride)*w + i/stride;
|
||||
int out_index = b*w*h*c*stride*stride + k*w*h*stride*stride + j*w*stride + i;
|
||||
if(forward) out[out_index] = scale*in[in_index];
|
||||
else in[in_index] += scale*out[out_index];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -0,0 +1,326 @@
|
|||
#include "blas.h"
|
||||
|
||||
#include <math.h>
|
||||
#include <assert.h>
|
||||
#include <float.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
void reorg_cpu(float *x, int w, int h, int c, int batch, int stride, int forward, float *out) {
|
||||
int b, i, j, k;
|
||||
int out_c = c / (stride * stride);
|
||||
|
||||
for (b = 0; b < batch; ++b) {
|
||||
for (k = 0; k < c; ++k) {
|
||||
for (j = 0; j < h; ++j) {
|
||||
for (i = 0; i < w; ++i) {
|
||||
int in_index = i + w * (j + h * (k + c * b));
|
||||
int c2 = k % out_c;
|
||||
int offset = k / out_c;
|
||||
int w2 = i * stride + offset % stride;
|
||||
int h2 = j * stride + offset / stride;
|
||||
int out_index = w2 + w * stride * (h2 + h * stride * (c2 + out_c * b));
|
||||
if (forward) out[out_index] = x[in_index];
|
||||
else out[in_index] = x[out_index];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void flatten(float *x, int size, int layers, int batch, int forward) {
|
||||
float *swap = (float *) calloc(size * layers * batch, sizeof(float));
|
||||
int i, c, b;
|
||||
for (b = 0; b < batch; ++b) {
|
||||
for (c = 0; c < layers; ++c) {
|
||||
for (i = 0; i < size; ++i) {
|
||||
int i1 = b * layers * size + c * size + i;
|
||||
int i2 = b * layers * size + i * layers + c;
|
||||
if (forward) swap[i2] = x[i1];
|
||||
else swap[i1] = x[i2];
|
||||
}
|
||||
}
|
||||
}
|
||||
memcpy(x, swap, size * layers * batch * sizeof(float));
|
||||
free(swap);
|
||||
}
|
||||
|
||||
void weighted_sum_cpu(float *a, float *b, float *s, int n, float *c) {
|
||||
int i;
|
||||
for (i = 0; i < n; ++i) {
|
||||
c[i] = s[i] * a[i] + (1 - s[i]) * (b ? b[i] : 0);
|
||||
}
|
||||
}
|
||||
|
||||
void weighted_delta_cpu(float *a, float *b, float *s, float *da, float *db, float *ds, int n, float *dc) {
|
||||
int i;
|
||||
for (i = 0; i < n; ++i) {
|
||||
if (da) da[i] += dc[i] * s[i];
|
||||
if (db) db[i] += dc[i] * (1 - s[i]);
|
||||
ds[i] += dc[i] * (a[i] - b[i]);
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
shortcut_cpu(int batch, int w1, int h1, int c1, float *add, int w2, int h2, int c2, float s1, float s2, float *out) {
|
||||
int stride = w1 / w2;
|
||||
int sample = w2 / w1;
|
||||
assert(stride == h1 / h2);
|
||||
assert(sample == h2 / h1);
|
||||
if (stride < 1) stride = 1;
|
||||
if (sample < 1) sample = 1;
|
||||
int minw = (w1 < w2) ? w1 : w2;
|
||||
int minh = (h1 < h2) ? h1 : h2;
|
||||
int minc = (c1 < c2) ? c1 : c2;
|
||||
|
||||
int i, j, k, b;
|
||||
for (b = 0; b < batch; ++b) {
|
||||
for (k = 0; k < minc; ++k) {
|
||||
for (j = 0; j < minh; ++j) {
|
||||
for (i = 0; i < minw; ++i) {
|
||||
int out_index = i * sample + w2 * (j * sample + h2 * (k + c2 * b));
|
||||
int add_index = i * stride + w1 * (j * stride + h1 * (k + c1 * b));
|
||||
out[out_index] = s1 * out[out_index] + s2 * add[add_index];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void mean_cpu(float *x, int batch, int filters, int spatial, float *mean) {
|
||||
float scale = 1. / (batch * spatial);
|
||||
int i, j, k;
|
||||
for (i = 0; i < filters; ++i) {
|
||||
mean[i] = 0;
|
||||
for (j = 0; j < batch; ++j) {
|
||||
for (k = 0; k < spatial; ++k) {
|
||||
int index = j * filters * spatial + i * spatial + k;
|
||||
mean[i] += x[index];
|
||||
}
|
||||
}
|
||||
mean[i] *= scale;
|
||||
}
|
||||
}
|
||||
|
||||
void variance_cpu(float *x, float *mean, int batch, int filters, int spatial, float *variance) {
|
||||
float scale = 1. / (batch * spatial - 1);
|
||||
int i, j, k;
|
||||
for (i = 0; i < filters; ++i) {
|
||||
variance[i] = 0;
|
||||
for (j = 0; j < batch; ++j) {
|
||||
for (k = 0; k < spatial; ++k) {
|
||||
int index = j * filters * spatial + i * spatial + k;
|
||||
variance[i] += pow((x[index] - mean[i]), 2);
|
||||
}
|
||||
}
|
||||
variance[i] *= scale;
|
||||
}
|
||||
}
|
||||
|
||||
void l2normalize_cpu(float *x, float *dx, int batch, int filters, int spatial) {
|
||||
int b, f, i;
|
||||
for (b = 0; b < batch; ++b) {
|
||||
for (i = 0; i < spatial; ++i) {
|
||||
float sum = 0;
|
||||
for (f = 0; f < filters; ++f) {
|
||||
int index = b * filters * spatial + f * spatial + i;
|
||||
sum += powf(x[index], 2);
|
||||
}
|
||||
sum = sqrtf(sum);
|
||||
for (f = 0; f < filters; ++f) {
|
||||
int index = b * filters * spatial + f * spatial + i;
|
||||
x[index] /= sum;
|
||||
dx[index] = (1 - x[index]) / sum;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void normalize_cpu(float *x, float *mean, float *variance, int batch, int filters, int spatial) {
|
||||
int b, f, i;
|
||||
for (b = 0; b < batch; ++b) {
|
||||
for (f = 0; f < filters; ++f) {
|
||||
for (i = 0; i < spatial; ++i) {
|
||||
int index = b * filters * spatial + f * spatial + i;
|
||||
x[index] = (x[index] - mean[f]) / (sqrt(variance[f]) + .000001f);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void const_cpu(int N, float ALPHA, float *X, int INCX) {
|
||||
int i;
|
||||
for (i = 0; i < N; ++i) X[i * INCX] = ALPHA;
|
||||
}
|
||||
|
||||
void mul_cpu(int N, float *X, int INCX, float *Y, int INCY) {
|
||||
int i;
|
||||
for (i = 0; i < N; ++i) Y[i * INCY] *= X[i * INCX];
|
||||
}
|
||||
|
||||
void pow_cpu(int N, float ALPHA, float *X, int INCX, float *Y, int INCY) {
|
||||
int i;
|
||||
for (i = 0; i < N; ++i) Y[i * INCY] = pow(X[i * INCX], ALPHA);
|
||||
}
|
||||
|
||||
void axpy_cpu(int N, float ALPHA, float *X, int INCX, float *Y, int INCY) {
|
||||
int i;
|
||||
for (i = 0; i < N; ++i) Y[i * INCY] += ALPHA * X[i * INCX];
|
||||
}
|
||||
|
||||
void scal_cpu(int N, float ALPHA, float *X, int INCX) {
|
||||
int i;
|
||||
for (i = 0; i < N; ++i) X[i * INCX] *= ALPHA;
|
||||
}
|
||||
|
||||
void fill_cpu(int N, float ALPHA, float *X, int INCX) {
|
||||
int i;
|
||||
for (i = 0; i < N; ++i) X[i * INCX] = ALPHA;
|
||||
}
|
||||
|
||||
void deinter_cpu(int NX, float *X, int NY, float *Y, int B, float *OUT) {
|
||||
int i, j;
|
||||
int index = 0;
|
||||
for (j = 0; j < B; ++j) {
|
||||
for (i = 0; i < NX; ++i) {
|
||||
if (X) X[j * NX + i] += OUT[index];
|
||||
++index;
|
||||
}
|
||||
for (i = 0; i < NY; ++i) {
|
||||
if (Y) Y[j * NY + i] += OUT[index];
|
||||
++index;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void inter_cpu(int NX, float *X, int NY, float *Y, int B, float *OUT) {
|
||||
int i, j;
|
||||
int index = 0;
|
||||
for (j = 0; j < B; ++j) {
|
||||
for (i = 0; i < NX; ++i) {
|
||||
OUT[index++] = X[j * NX + i];
|
||||
}
|
||||
for (i = 0; i < NY; ++i) {
|
||||
OUT[index++] = Y[j * NY + i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void copy_cpu(int N, float *X, int INCX, float *Y, int INCY) {
|
||||
int i;
|
||||
for (i = 0; i < N; ++i) Y[i * INCY] = X[i * INCX];
|
||||
}
|
||||
|
||||
void mult_add_into_cpu(int N, float *X, float *Y, float *Z) {
|
||||
int i;
|
||||
for (i = 0; i < N; ++i) Z[i] += X[i] * Y[i];
|
||||
}
|
||||
|
||||
void smooth_l1_cpu(int n, float *pred, float *truth, float *delta, float *error) {
|
||||
int i;
|
||||
for (i = 0; i < n; ++i) {
|
||||
float diff = truth[i] - pred[i];
|
||||
float abs_val = fabs(diff);
|
||||
if (abs_val < 1) {
|
||||
error[i] = diff * diff;
|
||||
delta[i] = diff;
|
||||
} else {
|
||||
error[i] = 2 * abs_val - 1;
|
||||
delta[i] = (diff < 0) ? 1 : -1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void l1_cpu(int n, float *pred, float *truth, float *delta, float *error) {
|
||||
int i;
|
||||
for (i = 0; i < n; ++i) {
|
||||
float diff = truth[i] - pred[i];
|
||||
error[i] = fabs(diff);
|
||||
delta[i] = diff > 0 ? 1 : -1;
|
||||
}
|
||||
}
|
||||
|
||||
void softmax_x_ent_cpu(int n, float *pred, float *truth, float *delta, float *error) {
|
||||
int i;
|
||||
for (i = 0; i < n; ++i) {
|
||||
float t = truth[i];
|
||||
float p = pred[i];
|
||||
error[i] = (t) ? -log(p) : 0;
|
||||
delta[i] = t - p;
|
||||
}
|
||||
}
|
||||
|
||||
void logistic_x_ent_cpu(int n, float *pred, float *truth, float *delta, float *error) {
|
||||
int i;
|
||||
for (i = 0; i < n; ++i) {
|
||||
float t = truth[i];
|
||||
float p = pred[i];
|
||||
error[i] = -t * log(p) - (1 - t) * log(1 - p);
|
||||
delta[i] = t - p;
|
||||
}
|
||||
}
|
||||
|
||||
void l2_cpu(int n, float *pred, float *truth, float *delta, float *error) {
|
||||
int i;
|
||||
for (i = 0; i < n; ++i) {
|
||||
float diff = truth[i] - pred[i];
|
||||
error[i] = diff * diff;
|
||||
delta[i] = diff;
|
||||
}
|
||||
}
|
||||
|
||||
float dot_cpu(int N, float *X, int INCX, float *Y, int INCY) {
|
||||
int i;
|
||||
float dot = 0;
|
||||
for (i = 0; i < N; ++i) dot += X[i * INCX] * Y[i * INCY];
|
||||
return dot;
|
||||
}
|
||||
|
||||
void softmax(float *input, int n, float temp, int stride, float *output) {
|
||||
int i;
|
||||
float sum = 0;
|
||||
float largest = -FLT_MAX;
|
||||
for (i = 0; i < n; ++i) {
|
||||
if (input[i * stride] > largest) largest = input[i * stride];
|
||||
}
|
||||
for (i = 0; i < n; ++i) {
|
||||
float e = exp(input[i * stride] / temp - largest / temp);
|
||||
sum += e;
|
||||
output[i * stride] = e;
|
||||
}
|
||||
for (i = 0; i < n; ++i) {
|
||||
output[i * stride] /= sum;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void softmax_cpu(float *input, int n, int batch, int batch_offset, int groups, int group_offset, int stride, float temp,
|
||||
float *output) {
|
||||
int g, b;
|
||||
for (b = 0; b < batch; ++b) {
|
||||
for (g = 0; g < groups; ++g) {
|
||||
softmax(input + b * batch_offset + g * group_offset, n, temp, stride,
|
||||
output + b * batch_offset + g * group_offset);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void upsample_cpu(float *in, int w, int h, int c, int batch, int stride, int forward, float scale, float *out) {
|
||||
int i, j, k, b;
|
||||
for (b = 0; b < batch; ++b) {
|
||||
for (k = 0; k < c; ++k) {
|
||||
for (j = 0; j < h * stride; ++j) {
|
||||
for (i = 0; i < w * stride; ++i) {
|
||||
int in_index = b * w * h * c + k * w * h + (j / stride) * w + i / stride;
|
||||
int out_index = b * w * h * c * stride * stride + k * w * h * stride * stride + j * w * stride + i;
|
||||
if (forward) out[out_index] = scale * in[in_index];
|
||||
else in[in_index] += scale * out[out_index];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
41
src/blas.h
41
src/blas.h
|
@ -1,51 +1,80 @@
|
|||
#ifndef BLAS_H
|
||||
#define BLAS_H
|
||||
|
||||
#include "darknet.h"
|
||||
|
||||
void flatten(float *x, int size, int layers, int batch, int forward);
|
||||
|
||||
void pm(int M, int N, float *A);
|
||||
|
||||
float *random_matrix(int rows, int cols);
|
||||
|
||||
void time_random_matrix(int TA, int TB, int m, int k, int n);
|
||||
|
||||
void reorg_cpu(float *x, int w, int h, int c, int batch, int stride, int forward, float *out);
|
||||
|
||||
void test_blas();
|
||||
|
||||
void inter_cpu(int NX, float *X, int NY, float *Y, int B, float *OUT);
|
||||
|
||||
void deinter_cpu(int NX, float *X, int NY, float *Y, int B, float *OUT);
|
||||
|
||||
void mult_add_into_cpu(int N, float *X, float *Y, float *Z);
|
||||
|
||||
void const_cpu(int N, float ALPHA, float *X, int INCX);
|
||||
void constrain_gpu(int N, float ALPHA, float * X, int INCX);
|
||||
|
||||
void constrain_gpu(int N, float ALPHA, float *X, int INCX);
|
||||
|
||||
void pow_cpu(int N, float ALPHA, float *X, int INCX, float *Y, int INCY);
|
||||
|
||||
void mul_cpu(int N, float *X, int INCX, float *Y, int INCY);
|
||||
|
||||
int test_gpu_blas();
|
||||
void shortcut_cpu(int batch, int w1, int h1, int c1, float *add, int w2, int h2, int c2, float s1, float s2, float *out);
|
||||
|
||||
void
|
||||
shortcut_cpu(int batch, int w1, int h1, int c1, float *add, int w2, int h2, int c2, float s1, float s2, float *out);
|
||||
|
||||
void mean_cpu(float *x, int batch, int filters, int spatial, float *mean);
|
||||
|
||||
void variance_cpu(float *x, float *mean, int batch, int filters, int spatial, float *variance);
|
||||
|
||||
void scale_bias(float *output, float *scales, int batch, int n, int size);
|
||||
|
||||
void backward_scale_cpu(float *x_norm, float *delta, int batch, int n, int size, float *scale_updates);
|
||||
|
||||
void mean_delta_cpu(float *delta, float *variance, int batch, int filters, int spatial, float *mean_delta);
|
||||
void variance_delta_cpu(float *x, float *delta, float *mean, float *variance, int batch, int filters, int spatial, float *variance_delta);
|
||||
void normalize_delta_cpu(float *x, float *mean, float *variance, float *mean_delta, float *variance_delta, int batch, int filters, int spatial, float *delta);
|
||||
|
||||
void variance_delta_cpu(float *x, float *delta, float *mean, float *variance, int batch, int filters, int spatial,
|
||||
float *variance_delta);
|
||||
|
||||
void normalize_delta_cpu(float *x, float *mean, float *variance, float *mean_delta, float *variance_delta, int batch,
|
||||
int filters, int spatial, float *delta);
|
||||
|
||||
void l2normalize_cpu(float *x, float *dx, int batch, int filters, int spatial);
|
||||
|
||||
void smooth_l1_cpu(int n, float *pred, float *truth, float *delta, float *error);
|
||||
|
||||
void l2_cpu(int n, float *pred, float *truth, float *delta, float *error);
|
||||
|
||||
void l1_cpu(int n, float *pred, float *truth, float *delta, float *error);
|
||||
|
||||
void logistic_x_ent_cpu(int n, float *pred, float *truth, float *delta, float *error);
|
||||
|
||||
void softmax_x_ent_cpu(int n, float *pred, float *truth, float *delta, float *error);
|
||||
|
||||
void weighted_sum_cpu(float *a, float *b, float *s, int num, float *c);
|
||||
|
||||
void weighted_delta_cpu(float *a, float *b, float *s, float *da, float *db, float *ds, int n, float *dc);
|
||||
|
||||
void softmax(float *input, int n, float temp, int stride, float *output);
|
||||
void softmax_cpu(float *input, int n, int batch, int batch_offset, int groups, int group_offset, int stride, float temp, float *output);
|
||||
|
||||
void softmax_cpu(float *input, int n, int batch, int batch_offset, int groups, int group_offset, int stride, float temp,
|
||||
float *output);
|
||||
|
||||
void upsample_cpu(float *in, int w, int h, int c, int batch, int stride, int forward, float scale, float *out);
|
||||
|
||||
#ifdef GPU
|
||||
#include "cuda.h"
|
||||
#include "hip/hip_runtime.h"
|
||||
#include "tree.h"
|
||||
|
||||
void axpy_gpu(int N, float ALPHA, float * X, int INCX, float * Y, int INCY);
|
||||
|
|
File diff suppressed because it is too large
Load Diff
357
src/box.c
357
src/box.c
|
@ -1,357 +0,0 @@
|
|||
#include "box.h"
|
||||
#include <stdio.h>
|
||||
#include <math.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
int nms_comparator(const void *pa, const void *pb)
|
||||
{
|
||||
detection a = *(detection *)pa;
|
||||
detection b = *(detection *)pb;
|
||||
float diff = 0;
|
||||
if(b.sort_class >= 0){
|
||||
diff = a.prob[b.sort_class] - b.prob[b.sort_class];
|
||||
} else {
|
||||
diff = a.objectness - b.objectness;
|
||||
}
|
||||
if(diff < 0) return 1;
|
||||
else if(diff > 0) return -1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
void do_nms_obj(detection *dets, int total, int classes, float thresh)
|
||||
{
|
||||
int i, j, k;
|
||||
k = total-1;
|
||||
for(i = 0; i <= k; ++i){
|
||||
if(dets[i].objectness == 0){
|
||||
detection swap = dets[i];
|
||||
dets[i] = dets[k];
|
||||
dets[k] = swap;
|
||||
--k;
|
||||
--i;
|
||||
}
|
||||
}
|
||||
total = k+1;
|
||||
|
||||
for(i = 0; i < total; ++i){
|
||||
dets[i].sort_class = -1;
|
||||
}
|
||||
|
||||
qsort(dets, total, sizeof(detection), nms_comparator);
|
||||
for(i = 0; i < total; ++i){
|
||||
if(dets[i].objectness == 0) continue;
|
||||
box a = dets[i].bbox;
|
||||
for(j = i+1; j < total; ++j){
|
||||
if(dets[j].objectness == 0) continue;
|
||||
box b = dets[j].bbox;
|
||||
if (box_iou(a, b) > thresh){
|
||||
dets[j].objectness = 0;
|
||||
for(k = 0; k < classes; ++k){
|
||||
dets[j].prob[k] = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void do_nms_sort(detection *dets, int total, int classes, float thresh)
|
||||
{
|
||||
int i, j, k;
|
||||
k = total-1;
|
||||
for(i = 0; i <= k; ++i){
|
||||
if(dets[i].objectness == 0){
|
||||
detection swap = dets[i];
|
||||
dets[i] = dets[k];
|
||||
dets[k] = swap;
|
||||
--k;
|
||||
--i;
|
||||
}
|
||||
}
|
||||
total = k+1;
|
||||
|
||||
for(k = 0; k < classes; ++k){
|
||||
for(i = 0; i < total; ++i){
|
||||
dets[i].sort_class = k;
|
||||
}
|
||||
qsort(dets, total, sizeof(detection), nms_comparator);
|
||||
for(i = 0; i < total; ++i){
|
||||
if(dets[i].prob[k] == 0) continue;
|
||||
box a = dets[i].bbox;
|
||||
for(j = i+1; j < total; ++j){
|
||||
box b = dets[j].bbox;
|
||||
if (box_iou(a, b) > thresh){
|
||||
dets[j].prob[k] = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
box float_to_box(float *f, int stride)
|
||||
{
|
||||
box b = {0};
|
||||
b.x = f[0];
|
||||
b.y = f[1*stride];
|
||||
b.w = f[2*stride];
|
||||
b.h = f[3*stride];
|
||||
return b;
|
||||
}
|
||||
|
||||
dbox derivative(box a, box b)
|
||||
{
|
||||
dbox d;
|
||||
d.dx = 0;
|
||||
d.dw = 0;
|
||||
float l1 = a.x - a.w/2;
|
||||
float l2 = b.x - b.w/2;
|
||||
if (l1 > l2){
|
||||
d.dx -= 1;
|
||||
d.dw += .5;
|
||||
}
|
||||
float r1 = a.x + a.w/2;
|
||||
float r2 = b.x + b.w/2;
|
||||
if(r1 < r2){
|
||||
d.dx += 1;
|
||||
d.dw += .5;
|
||||
}
|
||||
if (l1 > r2) {
|
||||
d.dx = -1;
|
||||
d.dw = 0;
|
||||
}
|
||||
if (r1 < l2){
|
||||
d.dx = 1;
|
||||
d.dw = 0;
|
||||
}
|
||||
|
||||
d.dy = 0;
|
||||
d.dh = 0;
|
||||
float t1 = a.y - a.h/2;
|
||||
float t2 = b.y - b.h/2;
|
||||
if (t1 > t2){
|
||||
d.dy -= 1;
|
||||
d.dh += .5;
|
||||
}
|
||||
float b1 = a.y + a.h/2;
|
||||
float b2 = b.y + b.h/2;
|
||||
if(b1 < b2){
|
||||
d.dy += 1;
|
||||
d.dh += .5;
|
||||
}
|
||||
if (t1 > b2) {
|
||||
d.dy = -1;
|
||||
d.dh = 0;
|
||||
}
|
||||
if (b1 < t2){
|
||||
d.dy = 1;
|
||||
d.dh = 0;
|
||||
}
|
||||
return d;
|
||||
}
|
||||
|
||||
float overlap(float x1, float w1, float x2, float w2)
|
||||
{
|
||||
float l1 = x1 - w1/2;
|
||||
float l2 = x2 - w2/2;
|
||||
float left = l1 > l2 ? l1 : l2;
|
||||
float r1 = x1 + w1/2;
|
||||
float r2 = x2 + w2/2;
|
||||
float right = r1 < r2 ? r1 : r2;
|
||||
return right - left;
|
||||
}
|
||||
|
||||
float box_intersection(box a, box b)
|
||||
{
|
||||
float w = overlap(a.x, a.w, b.x, b.w);
|
||||
float h = overlap(a.y, a.h, b.y, b.h);
|
||||
if(w < 0 || h < 0) return 0;
|
||||
float area = w*h;
|
||||
return area;
|
||||
}
|
||||
|
||||
float box_union(box a, box b)
|
||||
{
|
||||
float i = box_intersection(a, b);
|
||||
float u = a.w*a.h + b.w*b.h - i;
|
||||
return u;
|
||||
}
|
||||
|
||||
float box_iou(box a, box b)
|
||||
{
|
||||
return box_intersection(a, b)/box_union(a, b);
|
||||
}
|
||||
|
||||
float box_rmse(box a, box b)
|
||||
{
|
||||
return sqrt(pow(a.x-b.x, 2) +
|
||||
pow(a.y-b.y, 2) +
|
||||
pow(a.w-b.w, 2) +
|
||||
pow(a.h-b.h, 2));
|
||||
}
|
||||
|
||||
dbox dintersect(box a, box b)
|
||||
{
|
||||
float w = overlap(a.x, a.w, b.x, b.w);
|
||||
float h = overlap(a.y, a.h, b.y, b.h);
|
||||
dbox dover = derivative(a, b);
|
||||
dbox di;
|
||||
|
||||
di.dw = dover.dw*h;
|
||||
di.dx = dover.dx*h;
|
||||
di.dh = dover.dh*w;
|
||||
di.dy = dover.dy*w;
|
||||
|
||||
return di;
|
||||
}
|
||||
|
||||
dbox dunion(box a, box b)
|
||||
{
|
||||
dbox du;
|
||||
|
||||
dbox di = dintersect(a, b);
|
||||
du.dw = a.h - di.dw;
|
||||
du.dh = a.w - di.dh;
|
||||
du.dx = -di.dx;
|
||||
du.dy = -di.dy;
|
||||
|
||||
return du;
|
||||
}
|
||||
|
||||
|
||||
void test_dunion()
|
||||
{
|
||||
box a = {0, 0, 1, 1};
|
||||
box dxa= {0+.0001, 0, 1, 1};
|
||||
box dya= {0, 0+.0001, 1, 1};
|
||||
box dwa= {0, 0, 1+.0001, 1};
|
||||
box dha= {0, 0, 1, 1+.0001};
|
||||
|
||||
box b = {.5, .5, .2, .2};
|
||||
dbox di = dunion(a,b);
|
||||
printf("Union: %f %f %f %f\n", di.dx, di.dy, di.dw, di.dh);
|
||||
float inter = box_union(a, b);
|
||||
float xinter = box_union(dxa, b);
|
||||
float yinter = box_union(dya, b);
|
||||
float winter = box_union(dwa, b);
|
||||
float hinter = box_union(dha, b);
|
||||
xinter = (xinter - inter)/(.0001);
|
||||
yinter = (yinter - inter)/(.0001);
|
||||
winter = (winter - inter)/(.0001);
|
||||
hinter = (hinter - inter)/(.0001);
|
||||
printf("Union Manual %f %f %f %f\n", xinter, yinter, winter, hinter);
|
||||
}
|
||||
void test_dintersect()
|
||||
{
|
||||
box a = {0, 0, 1, 1};
|
||||
box dxa= {0+.0001, 0, 1, 1};
|
||||
box dya= {0, 0+.0001, 1, 1};
|
||||
box dwa= {0, 0, 1+.0001, 1};
|
||||
box dha= {0, 0, 1, 1+.0001};
|
||||
|
||||
box b = {.5, .5, .2, .2};
|
||||
dbox di = dintersect(a,b);
|
||||
printf("Inter: %f %f %f %f\n", di.dx, di.dy, di.dw, di.dh);
|
||||
float inter = box_intersection(a, b);
|
||||
float xinter = box_intersection(dxa, b);
|
||||
float yinter = box_intersection(dya, b);
|
||||
float winter = box_intersection(dwa, b);
|
||||
float hinter = box_intersection(dha, b);
|
||||
xinter = (xinter - inter)/(.0001);
|
||||
yinter = (yinter - inter)/(.0001);
|
||||
winter = (winter - inter)/(.0001);
|
||||
hinter = (hinter - inter)/(.0001);
|
||||
printf("Inter Manual %f %f %f %f\n", xinter, yinter, winter, hinter);
|
||||
}
|
||||
|
||||
void test_box()
|
||||
{
|
||||
test_dintersect();
|
||||
test_dunion();
|
||||
box a = {0, 0, 1, 1};
|
||||
box dxa= {0+.00001, 0, 1, 1};
|
||||
box dya= {0, 0+.00001, 1, 1};
|
||||
box dwa= {0, 0, 1+.00001, 1};
|
||||
box dha= {0, 0, 1, 1+.00001};
|
||||
|
||||
box b = {.5, 0, .2, .2};
|
||||
|
||||
float iou = box_iou(a,b);
|
||||
iou = (1-iou)*(1-iou);
|
||||
printf("%f\n", iou);
|
||||
dbox d = diou(a, b);
|
||||
printf("%f %f %f %f\n", d.dx, d.dy, d.dw, d.dh);
|
||||
|
||||
float xiou = box_iou(dxa, b);
|
||||
float yiou = box_iou(dya, b);
|
||||
float wiou = box_iou(dwa, b);
|
||||
float hiou = box_iou(dha, b);
|
||||
xiou = ((1-xiou)*(1-xiou) - iou)/(.00001);
|
||||
yiou = ((1-yiou)*(1-yiou) - iou)/(.00001);
|
||||
wiou = ((1-wiou)*(1-wiou) - iou)/(.00001);
|
||||
hiou = ((1-hiou)*(1-hiou) - iou)/(.00001);
|
||||
printf("manual %f %f %f %f\n", xiou, yiou, wiou, hiou);
|
||||
}
|
||||
|
||||
dbox diou(box a, box b)
|
||||
{
|
||||
float u = box_union(a,b);
|
||||
float i = box_intersection(a,b);
|
||||
dbox di = dintersect(a,b);
|
||||
dbox du = dunion(a,b);
|
||||
dbox dd = {0,0,0,0};
|
||||
|
||||
if(i <= 0 || 1) {
|
||||
dd.dx = b.x - a.x;
|
||||
dd.dy = b.y - a.y;
|
||||
dd.dw = b.w - a.w;
|
||||
dd.dh = b.h - a.h;
|
||||
return dd;
|
||||
}
|
||||
|
||||
dd.dx = 2*pow((1-(i/u)),1)*(di.dx*u - du.dx*i)/(u*u);
|
||||
dd.dy = 2*pow((1-(i/u)),1)*(di.dy*u - du.dy*i)/(u*u);
|
||||
dd.dw = 2*pow((1-(i/u)),1)*(di.dw*u - du.dw*i)/(u*u);
|
||||
dd.dh = 2*pow((1-(i/u)),1)*(di.dh*u - du.dh*i)/(u*u);
|
||||
return dd;
|
||||
}
|
||||
|
||||
|
||||
void do_nms(box *boxes, float **probs, int total, int classes, float thresh)
|
||||
{
|
||||
int i, j, k;
|
||||
for(i = 0; i < total; ++i){
|
||||
int any = 0;
|
||||
for(k = 0; k < classes; ++k) any = any || (probs[i][k] > 0);
|
||||
if(!any) {
|
||||
continue;
|
||||
}
|
||||
for(j = i+1; j < total; ++j){
|
||||
if (box_iou(boxes[i], boxes[j]) > thresh){
|
||||
for(k = 0; k < classes; ++k){
|
||||
if (probs[i][k] < probs[j][k]) probs[i][k] = 0;
|
||||
else probs[j][k] = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
box encode_box(box b, box anchor)
|
||||
{
|
||||
box encode;
|
||||
encode.x = (b.x - anchor.x) / anchor.w;
|
||||
encode.y = (b.y - anchor.y) / anchor.h;
|
||||
encode.w = log2(b.w / anchor.w);
|
||||
encode.h = log2(b.h / anchor.h);
|
||||
return encode;
|
||||
}
|
||||
|
||||
box decode_box(box b, box anchor)
|
||||
{
|
||||
box decode;
|
||||
decode.x = b.x * anchor.w + anchor.x;
|
||||
decode.y = b.y * anchor.h + anchor.y;
|
||||
decode.w = pow(2., b.w) * anchor.w;
|
||||
decode.h = pow(2., b.h) * anchor.h;
|
||||
return decode;
|
||||
}
|
|
@ -0,0 +1,339 @@
|
|||
#include "box.h"
|
||||
#include <stdio.h>
|
||||
#include <math.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
int nms_comparator(const void *pa, const void *pb) {
|
||||
detection a = *(detection *) pa;
|
||||
detection b = *(detection *) pb;
|
||||
float diff = 0;
|
||||
if (b.sort_class >= 0) {
|
||||
diff = a.prob[b.sort_class] - b.prob[b.sort_class];
|
||||
} else {
|
||||
diff = a.objectness - b.objectness;
|
||||
}
|
||||
if (diff < 0) return 1;
|
||||
else if (diff > 0) return -1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Objectness-based non-max suppression.
 * Compacts away zero-objectness detections, sorts the rest by objectness
 * (highest first), then zeroes the objectness and all class probs of any
 * box whose IoU with a higher-scoring kept box exceeds thresh.
 * Mutates dets in place. */
void do_nms_obj(detection *dets, int total, int classes, float thresh) {
    int i, j, k;

    /* Swap zero-objectness detections to the tail and shrink the range. */
    k = total - 1;
    for (i = 0; i <= k; ++i) {
        if (dets[i].objectness != 0) continue;
        detection tmp = dets[i];
        dets[i] = dets[k];
        dets[k] = tmp;
        --k;
        --i;   /* re-examine the element swapped into slot i */
    }
    total = k + 1;

    /* sort_class < 0 makes the comparator order by objectness. */
    for (i = 0; i < total; ++i) {
        dets[i].sort_class = -1;
    }
    qsort(dets, total, sizeof(detection), nms_comparator);

    /* Suppress lower-scoring boxes that overlap a kept box too much. */
    for (i = 0; i < total; ++i) {
        if (dets[i].objectness == 0) continue;
        box a = dets[i].bbox;
        for (j = i + 1; j < total; ++j) {
            if (dets[j].objectness == 0) continue;
            if (box_iou(a, dets[j].bbox) > thresh) {
                dets[j].objectness = 0;
                for (k = 0; k < classes; ++k) {
                    dets[j].prob[k] = 0;
                }
            }
        }
    }
}
|
||||
|
||||
|
||||
/* Per-class non-max suppression.
 * Compacts away zero-objectness detections, then for each class sorts the
 * survivors by that class's probability (highest first) and zeroes the
 * probability of any box whose IoU with a higher-scoring box exceeds
 * thresh. Mutates dets in place. */
void do_nms_sort(detection *dets, int total, int classes, float thresh) {
    int i, j, k;

    /* Swap zero-objectness detections to the tail and shrink the range. */
    k = total - 1;
    for (i = 0; i <= k; ++i) {
        if (dets[i].objectness != 0) continue;
        detection tmp = dets[i];
        dets[i] = dets[k];
        dets[k] = tmp;
        --k;
        --i;   /* re-examine the element swapped into slot i */
    }
    total = k + 1;

    for (k = 0; k < classes; ++k) {
        /* Direct the comparator at class k, then sort. */
        for (i = 0; i < total; ++i) {
            dets[i].sort_class = k;
        }
        qsort(dets, total, sizeof(detection), nms_comparator);

        for (i = 0; i < total; ++i) {
            if (dets[i].prob[k] == 0) continue;
            box a = dets[i].bbox;
            for (j = i + 1; j < total; ++j) {
                if (box_iou(a, dets[j].bbox) > thresh) {
                    dets[j].prob[k] = 0;
                }
            }
        }
    }
}
|
||||
|
||||
box float_to_box(float *f, int stride) {
|
||||
box b = {0};
|
||||
b.x = f[0];
|
||||
b.y = f[1 * stride];
|
||||
b.w = f[2 * stride];
|
||||
b.h = f[3 * stride];
|
||||
return b;
|
||||
}
|
||||
|
||||
/* Gradient of the 1-D overlap extents of box a with respect to a's
 * center/size, per axis. Boxes are (center x/y, width w, height h).
 * Overlapping edges each contribute +-1 to the center derivative and
 * +.5 to the size derivative; fully disjoint boxes get a fixed
 * (+-1, 0) push toward overlap. */
dbox derivative(box a, box b) {
    dbox d;

    /* --- x / w component --- */
    d.dx = 0;
    d.dw = 0;
    float a_left = a.x - a.w / 2;
    float b_left = b.x - b.w / 2;
    if (a_left > b_left) {
        d.dx -= 1;
        d.dw += .5;
    }
    float a_right = a.x + a.w / 2;
    float b_right = b.x + b.w / 2;
    if (a_right < b_right) {
        d.dx += 1;
        d.dw += .5;
    }
    if (a_left > b_right) {    /* disjoint: a entirely to the right of b */
        d.dx = -1;
        d.dw = 0;
    }
    if (a_right < b_left) {    /* disjoint: a entirely to the left of b */
        d.dx = 1;
        d.dw = 0;
    }

    /* --- y / h component (same logic, vertical axis) --- */
    d.dy = 0;
    d.dh = 0;
    float a_top = a.y - a.h / 2;
    float b_top = b.y - b.h / 2;
    if (a_top > b_top) {
        d.dy -= 1;
        d.dh += .5;
    }
    float a_bot = a.y + a.h / 2;
    float b_bot = b.y + b.h / 2;
    if (a_bot < b_bot) {
        d.dy += 1;
        d.dh += .5;
    }
    if (a_top > b_bot) {       /* disjoint vertically, a below b */
        d.dy = -1;
        d.dh = 0;
    }
    if (a_bot < b_top) {       /* disjoint vertically, a above b */
        d.dy = 1;
        d.dh = 0;
    }
    return d;
}
|
||||
|
||||
/* Length of the intersection of two 1-D intervals given by center and
 * width. Negative when the intervals are disjoint (gap size, negated). */
float overlap(float x1, float w1, float x2, float w2) {
    float a_lo = x1 - w1 / 2;
    float a_hi = x1 + w1 / 2;
    float b_lo = x2 - w2 / 2;
    float b_hi = x2 + w2 / 2;
    float lo = (a_lo > b_lo) ? a_lo : b_lo;   /* max of left edges  */
    float hi = (a_hi < b_hi) ? a_hi : b_hi;   /* min of right edges */
    return hi - lo;
}
|
||||
|
||||
/* Intersection area of two boxes; 0 when they do not overlap on
 * either axis. */
float box_intersection(box a, box b) {
    float iw = overlap(a.x, a.w, b.x, b.w);
    float ih = overlap(a.y, a.h, b.y, b.h);
    if (iw < 0 || ih < 0) return 0;
    return iw * ih;
}
|
||||
|
||||
/* Union area of two boxes: sum of areas minus the intersection. */
float box_union(box a, box b) {
    float inter = box_intersection(a, b);
    return a.w * a.h + b.w * b.h - inter;
}
|
||||
|
||||
/* Intersection-over-union of two boxes. */
float box_iou(box a, box b) {
    float inter = box_intersection(a, b);
    float uni = box_union(a, b);
    return inter / uni;
}
|
||||
|
||||
/* Euclidean distance between two boxes in (x, y, w, h) space. */
float box_rmse(box a, box b) {
    double sq = pow(a.x - b.x, 2)
              + pow(a.y - b.y, 2)
              + pow(a.w - b.w, 2)
              + pow(a.h - b.h, 2);
    return sqrt(sq);
}
|
||||
|
||||
/* Gradient of the intersection area w.r.t. box a, by the product rule:
 * x/w gradients scale by the overlap height, y/h by the overlap width. */
dbox dintersect(box a, box b) {
    float iw = overlap(a.x, a.w, b.x, b.w);
    float ih = overlap(a.y, a.h, b.y, b.h);
    dbox edge = derivative(a, b);

    dbox grad;
    grad.dx = edge.dx * ih;
    grad.dw = edge.dw * ih;
    grad.dy = edge.dy * iw;
    grad.dh = edge.dh * iw;
    return grad;
}
|
||||
|
||||
/* Gradient of the union area w.r.t. box a:
 * d(union) = d(area of a) - d(intersection). Area of a is w*h, so its
 * gradient is (0, 0, h, w); centers contribute only via intersection. */
dbox dunion(box a, box b) {
    dbox di = dintersect(a, b);
    dbox grad;
    grad.dx = -di.dx;
    grad.dy = -di.dy;
    grad.dw = a.h - di.dw;
    grad.dh = a.w - di.dh;
    return grad;
}
|
||||
|
||||
|
||||
void test_dunion() {
|
||||
box a = {0, 0, 1, 1};
|
||||
box dxa = {0 + .0001, 0, 1, 1};
|
||||
box dya = {0, 0 + .0001, 1, 1};
|
||||
box dwa = {0, 0, 1 + .0001, 1};
|
||||
box dha = {0, 0, 1, 1 + .0001};
|
||||
|
||||
box b = {.5, .5, .2, .2};
|
||||
dbox di = dunion(a, b);
|
||||
printf("Union: %f %f %f %f\n", di.dx, di.dy, di.dw, di.dh);
|
||||
float inter = box_union(a, b);
|
||||
float xinter = box_union(dxa, b);
|
||||
float yinter = box_union(dya, b);
|
||||
float winter = box_union(dwa, b);
|
||||
float hinter = box_union(dha, b);
|
||||
xinter = (xinter - inter) / (.0001);
|
||||
yinter = (yinter - inter) / (.0001);
|
||||
winter = (winter - inter) / (.0001);
|
||||
hinter = (hinter - inter) / (.0001);
|
||||
printf("Union Manual %f %f %f %f\n", xinter, yinter, winter, hinter);
|
||||
}
|
||||
|
||||
void test_dintersect() {
|
||||
box a = {0, 0, 1, 1};
|
||||
box dxa = {0 + .0001, 0, 1, 1};
|
||||
box dya = {0, 0 + .0001, 1, 1};
|
||||
box dwa = {0, 0, 1 + .0001, 1};
|
||||
box dha = {0, 0, 1, 1 + .0001};
|
||||
|
||||
box b = {.5, .5, .2, .2};
|
||||
dbox di = dintersect(a, b);
|
||||
printf("Inter: %f %f %f %f\n", di.dx, di.dy, di.dw, di.dh);
|
||||
float inter = box_intersection(a, b);
|
||||
float xinter = box_intersection(dxa, b);
|
||||
float yinter = box_intersection(dya, b);
|
||||
float winter = box_intersection(dwa, b);
|
||||
float hinter = box_intersection(dha, b);
|
||||
xinter = (xinter - inter) / (.0001);
|
||||
yinter = (yinter - inter) / (.0001);
|
||||
winter = (winter - inter) / (.0001);
|
||||
hinter = (hinter - inter) / (.0001);
|
||||
printf("Inter Manual %f %f %f %f\n", xinter, yinter, winter, hinter);
|
||||
}
|
||||
|
||||
void test_box() {
|
||||
test_dintersect();
|
||||
test_dunion();
|
||||
box a = {0, 0, 1, 1};
|
||||
box dxa = {0 + .00001, 0, 1, 1};
|
||||
box dya = {0, 0 + .00001, 1, 1};
|
||||
box dwa = {0, 0, 1 + .00001, 1};
|
||||
box dha = {0, 0, 1, 1 + .00001};
|
||||
|
||||
box b = {.5, 0, .2, .2};
|
||||
|
||||
float iou = box_iou(a, b);
|
||||
iou = (1 - iou) * (1 - iou);
|
||||
printf("%f\n", iou);
|
||||
dbox d = diou(a, b);
|
||||
printf("%f %f %f %f\n", d.dx, d.dy, d.dw, d.dh);
|
||||
|
||||
float xiou = box_iou(dxa, b);
|
||||
float yiou = box_iou(dya, b);
|
||||
float wiou = box_iou(dwa, b);
|
||||
float hiou = box_iou(dha, b);
|
||||
xiou = ((1 - xiou) * (1 - xiou) - iou) / (.00001);
|
||||
yiou = ((1 - yiou) * (1 - yiou) - iou) / (.00001);
|
||||
wiou = ((1 - wiou) * (1 - wiou) - iou) / (.00001);
|
||||
hiou = ((1 - hiou) * (1 - hiou) - iou) / (.00001);
|
||||
printf("manual %f %f %f %f\n", xiou, yiou, wiou, hiou);
|
||||
}
|
||||
|
||||
/* Gradient of the IoU-based loss w.r.t. box a.
 * NOTE(review): the `|| 1` in the condition below makes the first branch
 * unconditional, so the analytic quotient-rule gradient after it is dead
 * code and the function always returns the simple (b - a) difference.
 * This looks deliberate in upstream darknet and is preserved as-is. */
dbox diou(box a, box b) {
    float u = box_union(a, b);
    float i = box_intersection(a, b);
    dbox di = dintersect(a, b);
    dbox du = dunion(a, b);
    dbox dd = {0, 0, 0, 0};

    if (i <= 0 || 1) {
        /* Fallback gradient: push a straight toward b. */
        dd.dx = b.x - a.x;
        dd.dy = b.y - a.y;
        dd.dw = b.w - a.w;
        dd.dh = b.h - a.h;
        return dd;
    }

    /* Unreachable: quotient-rule gradient of (1 - i/u)^2. */
    dd.dx = 2 * pow((1 - (i / u)), 1) * (di.dx * u - du.dx * i) / (u * u);
    dd.dy = 2 * pow((1 - (i / u)), 1) * (di.dy * u - du.dy * i) / (u * u);
    dd.dw = 2 * pow((1 - (i / u)), 1) * (di.dw * u - du.dw * i) / (u * u);
    dd.dh = 2 * pow((1 - (i / u)), 1) * (di.dh * u - du.dh * i) / (u * u);
    return dd;
}
|
||||
|
||||
|
||||
/* Legacy NMS over parallel arrays (boxes[i] with per-class probs[i]).
 * For every overlapping pair above thresh, zeroes the lower of the two
 * class probabilities, per class. Mutates probs in place. */
void do_nms(box *boxes, float **probs, int total, int classes, float thresh) {
    int i, j, k;
    for (i = 0; i < total; ++i) {
        /* Skip boxes with no positive class probability at all. */
        int has_prob = 0;
        for (k = 0; k < classes; ++k) {
            if (probs[i][k] > 0) {
                has_prob = 1;
                break;
            }
        }
        if (!has_prob) continue;

        for (j = i + 1; j < total; ++j) {
            if (box_iou(boxes[i], boxes[j]) <= thresh) continue;
            /* Keep only the higher probability per class. */
            for (k = 0; k < classes; ++k) {
                if (probs[i][k] < probs[j][k]) probs[i][k] = 0;
                else probs[j][k] = 0;
            }
        }
    }
}
|
||||
|
||||
/* Encodes box b relative to an anchor: center offsets normalized by
 * anchor size, width/height as log2 ratios. Inverse of decode_box. */
box encode_box(box b, box anchor) {
    box e;
    e.x = (b.x - anchor.x) / anchor.w;
    e.y = (b.y - anchor.y) / anchor.h;
    e.w = log2(b.w / anchor.w);
    e.h = log2(b.h / anchor.h);
    return e;
}
|
||||
|
||||
/* Decodes an anchor-relative box back to absolute coordinates.
 * Inverse of encode_box: scales offsets by anchor size and raises 2 to
 * the encoded log-ratios. */
box decode_box(box b, box anchor) {
    box d;
    d.x = b.x * anchor.w + anchor.x;
    d.y = b.y * anchor.h + anchor.y;
    d.w = pow(2., b.w) * anchor.w;
    d.h = pow(2., b.h) * anchor.h;
    return d;
}
|
|
@ -1,14 +1,18 @@
|
|||
#ifndef BOX_H
|
||||
#define BOX_H
|
||||
|
||||
#include "darknet.h"
|
||||
|
||||
typedef struct{
|
||||
typedef struct {
|
||||
float dx, dy, dw, dh;
|
||||
} dbox;
|
||||
|
||||
float box_rmse(box a, box b);
|
||||
|
||||
dbox diou(box a, box b);
|
||||
|
||||
box decode_box(box b, box anchor);
|
||||
|
||||
box encode_box(box b, box anchor);
|
||||
|
||||
#endif
|
||||
|
|
|
@ -1,23 +1,24 @@
|
|||
#include <stdio.h>
|
||||
#include <math.h>
|
||||
|
||||
void col2im_add_pixel(float *im, int height, int width, int channels,
|
||||
int row, int col, int channel, int pad, float val)
|
||||
{
|
||||
int row, int col, int channel, int pad, float val) {
|
||||
row -= pad;
|
||||
col -= pad;
|
||||
|
||||
if (row < 0 || col < 0 ||
|
||||
row >= height || col >= width) return;
|
||||
im[col + width*(row + height*channel)] += val;
|
||||
row >= height || col >= width)
|
||||
return;
|
||||
im[col + width * (row + height * channel)] += val;
|
||||
}
|
||||
|
||||
//This one might be too, can't remember.
|
||||
void col2im_cpu(float* data_col,
|
||||
int channels, int height, int width,
|
||||
int ksize, int stride, int pad, float* data_im)
|
||||
{
|
||||
int c,h,w;
|
||||
int height_col = (height + 2*pad - ksize) / stride + 1;
|
||||
int width_col = (width + 2*pad - ksize) / stride + 1;
|
||||
void col2im_cpu(float *data_col,
|
||||
int channels, int height, int width,
|
||||
int ksize, int stride, int pad, float *data_im) {
|
||||
int c, h, w;
|
||||
int height_col = (height + 2 * pad - ksize) / stride + 1;
|
||||
int width_col = (width + 2 * pad - ksize) / stride + 1;
|
||||
|
||||
int channels_col = channels * ksize * ksize;
|
||||
for (c = 0; c < channels_col; ++c) {
|
||||
|
@ -31,7 +32,7 @@ void col2im_cpu(float* data_col,
|
|||
int col_index = (c * height_col + h) * width_col + w;
|
||||
double val = data_col[col_index];
|
||||
col2im_add_pixel(data_im, height, width, channels,
|
||||
im_row, im_col, c_im, pad, val);
|
||||
im_row, im_col, c_im, pad, val);
|
||||
}
|
||||
}
|
||||
}
|
|
@ -1,9 +1,9 @@
|
|||
#ifndef COL2IM_H
|
||||
#define COL2IM_H
|
||||
|
||||
void col2im_cpu(float* data_col,
|
||||
int channels, int height, int width,
|
||||
int ksize, int stride, int pad, float* data_im);
|
||||
void col2im_cpu(float *data_col,
|
||||
int channels, int height, int width,
|
||||
int ksize, int stride, int pad, float *data_im);
|
||||
|
||||
#ifdef GPU
|
||||
void col2im_gpu(float *data_col,
|
||||
|
|
|
@ -1,23 +1,25 @@
|
|||
#include "cuda_runtime.h"
|
||||
#include "curand.h"
|
||||
#include "cublas_v2.h"
|
||||
#include "hiprand.h"
|
||||
#include "hipblas.h"
|
||||
|
||||
|
||||
extern "C" {
|
||||
#include "col2im.h"
|
||||
#include "cuda.h"
|
||||
}
|
||||
|
||||
#ifdef GPU
|
||||
#define BLOCK 512
|
||||
#include "hip/hip_runtime.h"
|
||||
#endif
|
||||
|
||||
// src: https://github.com/BVLC/caffe/blob/master/src/caffe/util/im2col.cu
|
||||
// You may also want to read: https://github.com/BVLC/caffe/blob/master/LICENSE
|
||||
|
||||
__global__ void col2im_gpu_kernel(const int n, const float* data_col,
|
||||
const int height, const int width, const int ksize,
|
||||
const int pad,
|
||||
const int stride,
|
||||
const int height_col, const int width_col,
|
||||
float *data_im) {
|
||||
int index = blockIdx.x*blockDim.x+threadIdx.x;
|
||||
for(; index < n; index += blockDim.x*gridDim.x){
|
||||
__global__ void col2im_gpu_kernel(const int n, const float *data_col,
|
||||
const int height, const int width, const int ksize,
|
||||
const int pad,
|
||||
const int stride,
|
||||
const int height_col, const int width_col,
|
||||
float *data_im) {
|
||||
int index = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
for (; index < n; index += blockDim.x * gridDim.x) {
|
||||
float val = 0;
|
||||
int w = index % width + pad;
|
||||
int h = (index / width) % height + pad;
|
||||
|
@ -29,7 +31,7 @@ __global__ void col2im_gpu_kernel(const int n, const float* data_col,
|
|||
int h_col_end = min(h / stride + 1, height_col);
|
||||
// equivalent implementation
|
||||
int offset =
|
||||
(c * ksize * ksize + h * ksize + w) * height_col * width_col;
|
||||
(c * ksize * ksize + h * ksize + w) * height_col * width_col;
|
||||
int coeff_h_col = (1 - stride * ksize * height_col) * width_col;
|
||||
int coeff_w_col = (1 - stride * height_col * width_col);
|
||||
for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
|
||||
|
@ -42,17 +44,17 @@ __global__ void col2im_gpu_kernel(const int n, const float* data_col,
|
|||
}
|
||||
|
||||
void col2im_gpu(float *data_col,
|
||||
int channels, int height, int width,
|
||||
int ksize, int stride, int pad, float *data_im){
|
||||
int channels, int height, int width,
|
||||
int ksize, int stride, int pad, float *data_im) {
|
||||
// We are going to launch channels * height_col * width_col kernels, each
|
||||
// kernel responsible for copying a single-channel grid.
|
||||
int height_col = (height + 2 * pad - ksize) / stride + 1;
|
||||
int width_col = (width + 2 * pad - ksize) / stride + 1;
|
||||
int num_kernels = channels * height * width;
|
||||
col2im_gpu_kernel<<<(num_kernels+BLOCK-1)/BLOCK,
|
||||
BLOCK>>>(
|
||||
num_kernels, data_col, height, width, ksize, pad,
|
||||
stride, height_col,
|
||||
width_col, data_im);
|
||||
col2im_gpu_kernel<<<(num_kernels + BLOCK - 1) / BLOCK,
|
||||
BLOCK>>>(
|
||||
num_kernels, data_col, height, width, ksize, pad,
|
||||
stride, height_col,
|
||||
width_col, data_im);
|
||||
}
|
||||
|
||||
|
|
|
@ -7,21 +7,20 @@
|
|||
#include "parser.h"
|
||||
#include "box.h"
|
||||
|
||||
void train_compare(char *cfgfile, char *weightfile)
|
||||
{
|
||||
void train_compare(char *cfgfile, char *weightfile) {
|
||||
srand(time(0));
|
||||
float avg_loss = -1;
|
||||
char *base = basecfg(cfgfile);
|
||||
char *backup_directory = "/home/pjreddie/backup/";
|
||||
printf("%s\n", base);
|
||||
network net = parse_network_cfg(cfgfile);
|
||||
if(weightfile){
|
||||
if (weightfile) {
|
||||
load_weights(&net, weightfile);
|
||||
}
|
||||
printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
|
||||
int imgs = 1024;
|
||||
list *plist = get_paths("data/compare.train.list");
|
||||
char **paths = (char **)list_to_array(plist);
|
||||
char **paths = (char **) list_to_array(plist);
|
||||
int N = plist->size;
|
||||
printf("%d\n", N);
|
||||
clock_t time;
|
||||
|
@ -40,64 +39,64 @@ void train_compare(char *cfgfile, char *weightfile)
|
|||
args.type = COMPARE_DATA;
|
||||
|
||||
load_thread = load_data_in_thread(args);
|
||||
int epoch = *net.seen/N;
|
||||
int epoch = *net.seen / N;
|
||||
int i = 0;
|
||||
while(1){
|
||||
while (1) {
|
||||
++i;
|
||||
time=clock();
|
||||
time = clock();
|
||||
pthread_join(load_thread, 0);
|
||||
train = buffer;
|
||||
|
||||
load_thread = load_data_in_thread(args);
|
||||
printf("Loaded: %lf seconds\n", sec(clock()-time));
|
||||
time=clock();
|
||||
printf("Loaded: %lf seconds\n", sec(clock() - time));
|
||||
time = clock();
|
||||
float loss = train_network(net, train);
|
||||
if(avg_loss == -1) avg_loss = loss;
|
||||
avg_loss = avg_loss*.9 + loss*.1;
|
||||
printf("%.3f: %f, %f avg, %lf seconds, %ld images\n", (float)*net.seen/N, loss, avg_loss, sec(clock()-time), *net.seen);
|
||||
if (avg_loss == -1) avg_loss = loss;
|
||||
avg_loss = avg_loss * .9 + loss * .1;
|
||||
printf("%.3f: %f, %f avg, %lf seconds, %ld images\n", (float) *net.seen / N, loss, avg_loss,
|
||||
sec(clock() - time), *net.seen);
|
||||
free_data(train);
|
||||
if(i%100 == 0){
|
||||
if (i % 100 == 0) {
|
||||
char buff[256];
|
||||
sprintf(buff, "%s/%s_%d_minor_%d.weights",backup_directory,base, epoch, i);
|
||||
sprintf(buff, "%s/%s_%d_minor_%d.weights", backup_directory, base, epoch, i);
|
||||
save_weights(net, buff);
|
||||
}
|
||||
if(*net.seen/N > epoch){
|
||||
epoch = *net.seen/N;
|
||||
if (*net.seen / N > epoch) {
|
||||
epoch = *net.seen / N;
|
||||
i = 0;
|
||||
char buff[256];
|
||||
sprintf(buff, "%s/%s_%d.weights",backup_directory,base, epoch);
|
||||
sprintf(buff, "%s/%s_%d.weights", backup_directory, base, epoch);
|
||||
save_weights(net, buff);
|
||||
if(epoch%22 == 0) net.learning_rate *= .1;
|
||||
if (epoch % 22 == 0) net.learning_rate *= .1;
|
||||
}
|
||||
}
|
||||
pthread_join(load_thread, 0);
|
||||
free_data(buffer);
|
||||
free_network(net);
|
||||
free_ptrs((void**)paths, plist->size);
|
||||
free_ptrs((void **) paths, plist->size);
|
||||
free_list(plist);
|
||||
free(base);
|
||||
}
|
||||
|
||||
void validate_compare(char *filename, char *weightfile)
|
||||
{
|
||||
void validate_compare(char *filename, char *weightfile) {
|
||||
int i = 0;
|
||||
network net = parse_network_cfg(filename);
|
||||
if(weightfile){
|
||||
if (weightfile) {
|
||||
load_weights(&net, weightfile);
|
||||
}
|
||||
srand(time(0));
|
||||
|
||||
list *plist = get_paths("data/compare.val.list");
|
||||
//list *plist = get_paths("data/compare.val.old");
|
||||
char **paths = (char **)list_to_array(plist);
|
||||
int N = plist->size/2;
|
||||
char **paths = (char **) list_to_array(plist);
|
||||
int N = plist->size / 2;
|
||||
free_list(plist);
|
||||
|
||||
clock_t time;
|
||||
int correct = 0;
|
||||
int total = 0;
|
||||
int splits = 10;
|
||||
int num = (i+1)*N/splits - i*N/splits;
|
||||
int num = (i + 1) * N / splits - i * N / splits;
|
||||
|
||||
data val, buffer;
|
||||
|
||||
|
@ -112,35 +111,36 @@ void validate_compare(char *filename, char *weightfile)
|
|||
args.type = COMPARE_DATA;
|
||||
|
||||
pthread_t load_thread = load_data_in_thread(args);
|
||||
for(i = 1; i <= splits; ++i){
|
||||
time=clock();
|
||||
for (i = 1; i <= splits; ++i) {
|
||||
time = clock();
|
||||
|
||||
pthread_join(load_thread, 0);
|
||||
val = buffer;
|
||||
|
||||
num = (i+1)*N/splits - i*N/splits;
|
||||
char **part = paths+(i*N/splits);
|
||||
if(i != splits){
|
||||
num = (i + 1) * N / splits - i * N / splits;
|
||||
char **part = paths + (i * N / splits);
|
||||
if (i != splits) {
|
||||
args.paths = part;
|
||||
load_thread = load_data_in_thread(args);
|
||||
}
|
||||
printf("Loaded: %d images in %lf seconds\n", val.X.rows, sec(clock()-time));
|
||||
printf("Loaded: %d images in %lf seconds\n", val.X.rows, sec(clock() - time));
|
||||
|
||||
time=clock();
|
||||
time = clock();
|
||||
matrix pred = network_predict_data(net, val);
|
||||
int j,k;
|
||||
for(j = 0; j < val.y.rows; ++j){
|
||||
for(k = 0; k < 20; ++k){
|
||||
if(val.y.vals[j][k*2] != val.y.vals[j][k*2+1]){
|
||||
int j, k;
|
||||
for (j = 0; j < val.y.rows; ++j) {
|
||||
for (k = 0; k < 20; ++k) {
|
||||
if (val.y.vals[j][k * 2] != val.y.vals[j][k * 2 + 1]) {
|
||||
++total;
|
||||
if((val.y.vals[j][k*2] < val.y.vals[j][k*2+1]) == (pred.vals[j][k*2] < pred.vals[j][k*2+1])){
|
||||
if ((val.y.vals[j][k * 2] < val.y.vals[j][k * 2 + 1]) ==
|
||||
(pred.vals[j][k * 2] < pred.vals[j][k * 2 + 1])) {
|
||||
++correct;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
free_matrix(pred);
|
||||
printf("%d: Acc: %f, %lf seconds, %d images\n", i, (float)correct/total, sec(clock()-time), val.X.rows);
|
||||
printf("%d: Acc: %f, %lf seconds, %d images\n", i, (float) correct / total, sec(clock() - time), val.X.rows);
|
||||
free_data(val);
|
||||
}
|
||||
}
|
||||
|
@ -148,7 +148,7 @@ void validate_compare(char *filename, char *weightfile)
|
|||
typedef struct {
|
||||
network net;
|
||||
char *filename;
|
||||
int class;
|
||||
int nclass;
|
||||
int classes;
|
||||
float elo;
|
||||
float *elos;
|
||||
|
@ -157,78 +157,73 @@ typedef struct {
|
|||
int total_compares = 0;
|
||||
int current_class = 0;
|
||||
|
||||
int elo_comparator(const void*a, const void *b)
|
||||
{
|
||||
sortable_bbox box1 = *(sortable_bbox*)a;
|
||||
sortable_bbox box2 = *(sortable_bbox*)b;
|
||||
if(box1.elos[current_class] == box2.elos[current_class]) return 0;
|
||||
if(box1.elos[current_class] > box2.elos[current_class]) return -1;
|
||||
int elo_comparator(const void *a, const void *b) {
|
||||
sortable_bbox box1 = *(sortable_bbox *) a;
|
||||
sortable_bbox box2 = *(sortable_bbox *) b;
|
||||
if (box1.elos[current_class] == box2.elos[current_class]) return 0;
|
||||
if (box1.elos[current_class] > box2.elos[current_class]) return -1;
|
||||
return 1;
|
||||
}
|
||||
|
||||
int bbox_comparator(const void *a, const void *b)
|
||||
{
|
||||
int bbox_comparator(const void *a, const void *b) {
|
||||
++total_compares;
|
||||
sortable_bbox box1 = *(sortable_bbox*)a;
|
||||
sortable_bbox box2 = *(sortable_bbox*)b;
|
||||
sortable_bbox box1 = *(sortable_bbox *) a;
|
||||
sortable_bbox box2 = *(sortable_bbox *) b;
|
||||
network net = box1.net;
|
||||
int class = box1.class;
|
||||
int nclass = box1.nclass;
|
||||
|
||||
image im1 = load_image_color(box1.filename, net.w, net.h);
|
||||
image im2 = load_image_color(box2.filename, net.w, net.h);
|
||||
float *X = calloc(net.w*net.h*net.c, sizeof(float));
|
||||
memcpy(X, im1.data, im1.w*im1.h*im1.c*sizeof(float));
|
||||
memcpy(X+im1.w*im1.h*im1.c, im2.data, im2.w*im2.h*im2.c*sizeof(float));
|
||||
float *X = (float *) calloc(net.w * net.h * net.c, sizeof(float));
|
||||
memcpy(X, im1.data, im1.w * im1.h * im1.c * sizeof(float));
|
||||
memcpy(X + im1.w * im1.h * im1.c, im2.data, im2.w * im2.h * im2.c * sizeof(float));
|
||||
float *predictions = network_predict(net, X);
|
||||
|
||||
|
||||
free_image(im1);
|
||||
free_image(im2);
|
||||
free(X);
|
||||
if (predictions[class*2] > predictions[class*2+1]){
|
||||
if (predictions[nclass * 2] > predictions[nclass * 2 + 1]) {
|
||||
return 1;
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
void bbox_update(sortable_bbox *a, sortable_bbox *b, int class, int result)
|
||||
{
|
||||
void bbox_update(sortable_bbox *a, sortable_bbox *b, int nclass, int result) {
|
||||
int k = 32;
|
||||
float EA = 1./(1+pow(10, (b->elos[class] - a->elos[class])/400.));
|
||||
float EB = 1./(1+pow(10, (a->elos[class] - b->elos[class])/400.));
|
||||
float EA = 1. / (1 + pow(10, (b->elos[nclass] - a->elos[nclass]) / 400.));
|
||||
float EB = 1. / (1 + pow(10, (a->elos[nclass] - b->elos[nclass]) / 400.));
|
||||
float SA = result ? 1 : 0;
|
||||
float SB = result ? 0 : 1;
|
||||
a->elos[class] += k*(SA - EA);
|
||||
b->elos[class] += k*(SB - EB);
|
||||
a->elos[nclass] += k * (SA - EA);
|
||||
b->elos[nclass] += k * (SB - EB);
|
||||
}
|
||||
|
||||
void bbox_fight(network net, sortable_bbox *a, sortable_bbox *b, int classes, int class)
|
||||
{
|
||||
void bbox_fight(network net, sortable_bbox *a, sortable_bbox *b, int classes, int nclass) {
|
||||
image im1 = load_image_color(a->filename, net.w, net.h);
|
||||
image im2 = load_image_color(b->filename, net.w, net.h);
|
||||
float *X = calloc(net.w*net.h*net.c, sizeof(float));
|
||||
memcpy(X, im1.data, im1.w*im1.h*im1.c*sizeof(float));
|
||||
memcpy(X+im1.w*im1.h*im1.c, im2.data, im2.w*im2.h*im2.c*sizeof(float));
|
||||
float *X = (float *) calloc(net.w * net.h * net.c, sizeof(float));
|
||||
memcpy(X, im1.data, im1.w * im1.h * im1.c * sizeof(float));
|
||||
memcpy(X + im1.w * im1.h * im1.c, im2.data, im2.w * im2.h * im2.c * sizeof(float));
|
||||
float *predictions = network_predict(net, X);
|
||||
++total_compares;
|
||||
|
||||
int i;
|
||||
for(i = 0; i < classes; ++i){
|
||||
if(class < 0 || class == i){
|
||||
int result = predictions[i*2] > predictions[i*2+1];
|
||||
for (i = 0; i < classes; ++i) {
|
||||
if (nclass < 0 || nclass == i) {
|
||||
int result = predictions[i * 2] > predictions[i * 2 + 1];
|
||||
bbox_update(a, b, i, result);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
free_image(im1);
|
||||
free_image(im2);
|
||||
free(X);
|
||||
}
|
||||
|
||||
void SortMaster3000(char *filename, char *weightfile)
|
||||
{
|
||||
void SortMaster3000(char *filename, char *weightfile) {
|
||||
int i = 0;
|
||||
network net = parse_network_cfg(filename);
|
||||
if(weightfile){
|
||||
if (weightfile) {
|
||||
load_weights(&net, weightfile);
|
||||
}
|
||||
srand(time(0));
|
||||
|
@ -236,31 +231,30 @@ void SortMaster3000(char *filename, char *weightfile)
|
|||
|
||||
list *plist = get_paths("data/compare.sort.list");
|
||||
//list *plist = get_paths("data/compare.val.old");
|
||||
char **paths = (char **)list_to_array(plist);
|
||||
char **paths = (char **) list_to_array(plist);
|
||||
int N = plist->size;
|
||||
free_list(plist);
|
||||
sortable_bbox *boxes = calloc(N, sizeof(sortable_bbox));
|
||||
sortable_bbox *boxes = (sortable_bbox *) calloc(N, sizeof(sortable_bbox));
|
||||
printf("Sorting %d boxes...\n", N);
|
||||
for(i = 0; i < N; ++i){
|
||||
for (i = 0; i < N; ++i) {
|
||||
boxes[i].filename = paths[i];
|
||||
boxes[i].net = net;
|
||||
boxes[i].class = 7;
|
||||
boxes[i].nclass = 7;
|
||||
boxes[i].elo = 1500;
|
||||
}
|
||||
clock_t time=clock();
|
||||
clock_t time = clock();
|
||||
qsort(boxes, N, sizeof(sortable_bbox), bbox_comparator);
|
||||
for(i = 0; i < N; ++i){
|
||||
for (i = 0; i < N; ++i) {
|
||||
printf("%s\n", boxes[i].filename);
|
||||
}
|
||||
printf("Sorted in %d compares, %f secs\n", total_compares, sec(clock()-time));
|
||||
printf("Sorted in %d compares, %f secs\n", total_compares, sec(clock() - time));
|
||||
}
|
||||
|
||||
void BattleRoyaleWithCheese(char *filename, char *weightfile)
|
||||
{
|
||||
void BattleRoyaleWithCheese(char *filename, char *weightfile) {
|
||||
int classes = 20;
|
||||
int i,j;
|
||||
int i, j;
|
||||
network net = parse_network_cfg(filename);
|
||||
if(weightfile){
|
||||
if (weightfile) {
|
||||
load_weights(&net, weightfile);
|
||||
}
|
||||
srand(time(0));
|
||||
|
@ -270,69 +264,68 @@ void BattleRoyaleWithCheese(char *filename, char *weightfile)
|
|||
//list *plist = get_paths("data/compare.small.list");
|
||||
//list *plist = get_paths("data/compare.cat.list");
|
||||
//list *plist = get_paths("data/compare.val.old");
|
||||
char **paths = (char **)list_to_array(plist);
|
||||
char **paths = (char **) list_to_array(plist);
|
||||
int N = plist->size;
|
||||
int total = N;
|
||||
free_list(plist);
|
||||
sortable_bbox *boxes = calloc(N, sizeof(sortable_bbox));
|
||||
sortable_bbox *boxes = (sortable_bbox *) calloc(N, sizeof(sortable_bbox));
|
||||
printf("Battling %d boxes...\n", N);
|
||||
for(i = 0; i < N; ++i){
|
||||
for (i = 0; i < N; ++i) {
|
||||
boxes[i].filename = paths[i];
|
||||
boxes[i].net = net;
|
||||
boxes[i].classes = classes;
|
||||
boxes[i].elos = calloc(classes, sizeof(float));;
|
||||
for(j = 0; j < classes; ++j){
|
||||
boxes[i].elos = (float *) calloc(classes, sizeof(float));;
|
||||
for (j = 0; j < classes; ++j) {
|
||||
boxes[i].elos[j] = 1500;
|
||||
}
|
||||
}
|
||||
int round;
|
||||
clock_t time=clock();
|
||||
for(round = 1; round <= 4; ++round){
|
||||
clock_t round_time=clock();
|
||||
clock_t time = clock();
|
||||
for (round = 1; round <= 4; ++round) {
|
||||
clock_t round_time = clock();
|
||||
printf("Round: %d\n", round);
|
||||
shuffle(boxes, N, sizeof(sortable_bbox));
|
||||
for(i = 0; i < N/2; ++i){
|
||||
bbox_fight(net, boxes+i*2, boxes+i*2+1, classes, -1);
|
||||
for (i = 0; i < N / 2; ++i) {
|
||||
bbox_fight(net, boxes + i * 2, boxes + i * 2 + 1, classes, -1);
|
||||
}
|
||||
printf("Round: %f secs, %d remaining\n", sec(clock()-round_time), N);
|
||||
printf("Round: %f secs, %d remaining\n", sec(clock() - round_time), N);
|
||||
}
|
||||
|
||||
int class;
|
||||
int nclass;
|
||||
|
||||
for (class = 0; class < classes; ++class){
|
||||
for (nclass = 0; nclass < classes; ++nclass) {
|
||||
|
||||
N = total;
|
||||
current_class = class;
|
||||
current_class = nclass;
|
||||
qsort(boxes, N, sizeof(sortable_bbox), elo_comparator);
|
||||
N /= 2;
|
||||
|
||||
for(round = 1; round <= 100; ++round){
|
||||
clock_t round_time=clock();
|
||||
for (round = 1; round <= 100; ++round) {
|
||||
clock_t round_time = clock();
|
||||
printf("Round: %d\n", round);
|
||||
|
||||
sorta_shuffle(boxes, N, sizeof(sortable_bbox), 10);
|
||||
for(i = 0; i < N/2; ++i){
|
||||
bbox_fight(net, boxes+i*2, boxes+i*2+1, classes, class);
|
||||
for (i = 0; i < N / 2; ++i) {
|
||||
bbox_fight(net, boxes + i * 2, boxes + i * 2 + 1, classes, nclass);
|
||||
}
|
||||
qsort(boxes, N, sizeof(sortable_bbox), elo_comparator);
|
||||
if(round <= 20) N = (N*9/10)/2*2;
|
||||
if (round <= 20) N = (N * 9 / 10) / 2 * 2;
|
||||
|
||||
printf("Round: %f secs, %d remaining\n", sec(clock()-round_time), N);
|
||||
printf("Round: %f secs, %d remaining\n", sec(clock() - round_time), N);
|
||||
}
|
||||
char buff[256];
|
||||
sprintf(buff, "results/battle_%d.log", class);
|
||||
sprintf(buff, "results/battle_%d.log", nclass);
|
||||
FILE *outfp = fopen(buff, "w");
|
||||
for(i = 0; i < N; ++i){
|
||||
fprintf(outfp, "%s %f\n", boxes[i].filename, boxes[i].elos[class]);
|
||||
for (i = 0; i < N; ++i) {
|
||||
fprintf(outfp, "%s %f\n", boxes[i].filename, boxes[i].elos[nclass]);
|
||||
}
|
||||
fclose(outfp);
|
||||
}
|
||||
printf("Tournament in %d compares, %f secs\n", total_compares, sec(clock()-time));
|
||||
printf("Tournament in %d compares, %f secs\n", total_compares, sec(clock() - time));
|
||||
}
|
||||
|
||||
void run_compare(int argc, char **argv)
|
||||
{
|
||||
if(argc < 4){
|
||||
void run_compare(int argc, char **argv) {
|
||||
if (argc < 4) {
|
||||
fprintf(stderr, "usage: %s %s [train/test/valid] [cfg] [weights (optional)]\n", argv[0], argv[1]);
|
||||
return;
|
||||
}
|
||||
|
@ -340,10 +333,10 @@ void run_compare(int argc, char **argv)
|
|||
char *cfg = argv[3];
|
||||
char *weights = (argc > 4) ? argv[4] : 0;
|
||||
//char *filename = (argc > 5) ? argv[5]: 0;
|
||||
if(0==strcmp(argv[2], "train")) train_compare(cfg, weights);
|
||||
else if(0==strcmp(argv[2], "valid")) validate_compare(cfg, weights);
|
||||
else if(0==strcmp(argv[2], "sort")) SortMaster3000(cfg, weights);
|
||||
else if(0==strcmp(argv[2], "battle")) BattleRoyaleWithCheese(cfg, weights);
|
||||
if (0 == strcmp(argv[2], "train")) train_compare(cfg, weights);
|
||||
else if (0 == strcmp(argv[2], "valid")) validate_compare(cfg, weights);
|
||||
else if (0 == strcmp(argv[2], "sort")) SortMaster3000(cfg, weights);
|
||||
else if (0 == strcmp(argv[2], "battle")) BattleRoyaleWithCheese(cfg, weights);
|
||||
/*
|
||||
else if(0==strcmp(argv[2], "train")) train_coco(cfg, weights);
|
||||
else if(0==strcmp(argv[2], "extract")) extract_boxes(cfg, weights);
|
|
@ -2,7 +2,7 @@
|
|||
#include "convolutional_layer.h"
|
||||
#include "batchnorm_layer.h"
|
||||
#include "utils.h"
|
||||
#include "cuda.h"
|
||||
|
||||
#include "blas.h"
|
||||
#include "gemm.h"
|
||||
|
||||
|
@ -11,16 +11,19 @@
|
|||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
layer make_connected_layer(int batch, int inputs, int outputs, ACTIVATION activation, int batch_normalize, int adam)
|
||||
{
|
||||
#ifdef GPU
|
||||
#include "hip/hip_runtime.h"
|
||||
#endif
|
||||
|
||||
layer make_connected_layer(int batch, int inputs, int outputs, ACTIVATION activation, int batch_normalize, int adam) {
|
||||
int i;
|
||||
layer l = {0};
|
||||
layer l = {(LAYER_TYPE) 0};
|
||||
l.learning_rate_scale = 1;
|
||||
l.type = CONNECTED;
|
||||
|
||||
l.inputs = inputs;
|
||||
l.outputs = outputs;
|
||||
l.batch=batch;
|
||||
l.batch = batch;
|
||||
l.batch_normalize = batch_normalize;
|
||||
l.h = 1;
|
||||
l.w = 1;
|
||||
|
@ -29,54 +32,54 @@ layer make_connected_layer(int batch, int inputs, int outputs, ACTIVATION activa
|
|||
l.out_w = 1;
|
||||
l.out_c = outputs;
|
||||
|
||||
l.output = calloc(batch*outputs, sizeof(float));
|
||||
l.delta = calloc(batch*outputs, sizeof(float));
|
||||
l.output = (float *) calloc(batch * outputs, sizeof(float));
|
||||
l.delta = (float*) calloc(batch * outputs, sizeof(float));
|
||||
|
||||
l.weight_updates = calloc(inputs*outputs, sizeof(float));
|
||||
l.bias_updates = calloc(outputs, sizeof(float));
|
||||
l.weight_updates = (float*) calloc(inputs * outputs, sizeof(float));
|
||||
l.bias_updates = (float*) calloc(outputs, sizeof(float));
|
||||
|
||||
l.weights = calloc(outputs*inputs, sizeof(float));
|
||||
l.biases = calloc(outputs, sizeof(float));
|
||||
l.weights = (float*) calloc(outputs * inputs, sizeof(float));
|
||||
l.biases = (float*) calloc(outputs, sizeof(float));
|
||||
|
||||
l.forward = forward_connected_layer;
|
||||
l.backward = backward_connected_layer;
|
||||
l.update = update_connected_layer;
|
||||
|
||||
//float scale = 1./sqrt(inputs);
|
||||
float scale = sqrt(2./inputs);
|
||||
for(i = 0; i < outputs*inputs; ++i){
|
||||
l.weights[i] = scale*rand_uniform(-1, 1);
|
||||
float scale = sqrt(2. / inputs);
|
||||
for (i = 0; i < outputs * inputs; ++i) {
|
||||
l.weights[i] = scale * rand_uniform(-1, 1);
|
||||
}
|
||||
|
||||
for(i = 0; i < outputs; ++i){
|
||||
for (i = 0; i < outputs; ++i) {
|
||||
l.biases[i] = 0;
|
||||
}
|
||||
|
||||
if(adam){
|
||||
l.m = calloc(l.inputs*l.outputs, sizeof(float));
|
||||
l.v = calloc(l.inputs*l.outputs, sizeof(float));
|
||||
l.bias_m = calloc(l.outputs, sizeof(float));
|
||||
l.scale_m = calloc(l.outputs, sizeof(float));
|
||||
l.bias_v = calloc(l.outputs, sizeof(float));
|
||||
l.scale_v = calloc(l.outputs, sizeof(float));
|
||||
if (adam) {
|
||||
l.m = (float *) calloc(l.inputs * l.outputs, sizeof(float));
|
||||
l.v = (float *) calloc(l.inputs * l.outputs, sizeof(float));
|
||||
l.bias_m = (float *) calloc(l.outputs, sizeof(float));
|
||||
l.scale_m = (float *) calloc(l.outputs, sizeof(float));
|
||||
l.bias_v = (float *) calloc(l.outputs, sizeof(float));
|
||||
l.scale_v = (float *) calloc(l.outputs, sizeof(float));
|
||||
}
|
||||
if(batch_normalize){
|
||||
l.scales = calloc(outputs, sizeof(float));
|
||||
l.scale_updates = calloc(outputs, sizeof(float));
|
||||
for(i = 0; i < outputs; ++i){
|
||||
if (batch_normalize) {
|
||||
l.scales = (float *) calloc(outputs, sizeof(float));
|
||||
l.scale_updates = (float *) calloc(outputs, sizeof(float));
|
||||
for (i = 0; i < outputs; ++i) {
|
||||
l.scales[i] = 1;
|
||||
}
|
||||
|
||||
l.mean = calloc(outputs, sizeof(float));
|
||||
l.mean_delta = calloc(outputs, sizeof(float));
|
||||
l.variance = calloc(outputs, sizeof(float));
|
||||
l.variance_delta = calloc(outputs, sizeof(float));
|
||||
l.mean = (float *) calloc(outputs, sizeof(float));
|
||||
l.mean_delta = (float *) calloc(outputs, sizeof(float));
|
||||
l.variance = (float *) calloc(outputs, sizeof(float));
|
||||
l.variance_delta = (float *) calloc(outputs, sizeof(float));
|
||||
|
||||
l.rolling_mean = calloc(outputs, sizeof(float));
|
||||
l.rolling_variance = calloc(outputs, sizeof(float));
|
||||
l.rolling_mean = (float *) calloc(outputs, sizeof(float));
|
||||
l.rolling_variance = (float *) calloc(outputs, sizeof(float));
|
||||
|
||||
l.x = calloc(batch*outputs, sizeof(float));
|
||||
l.x_norm = calloc(batch*outputs, sizeof(float));
|
||||
l.x = (float *) calloc(batch * outputs, sizeof(float));
|
||||
l.x_norm = (float *) calloc(batch * outputs, sizeof(float));
|
||||
}
|
||||
|
||||
#ifdef GPU
|
||||
|
@ -117,10 +120,10 @@ layer make_connected_layer(int batch, int inputs, int outputs, ACTIVATION activa
|
|||
l.x_gpu = cuda_make_array(l.output, l.batch*outputs);
|
||||
l.x_norm_gpu = cuda_make_array(l.output, l.batch*outputs);
|
||||
#ifdef CUDNN
|
||||
cudnnCreateTensorDescriptor(&l.normTensorDesc);
|
||||
cudnnCreateTensorDescriptor(&l.dstTensorDesc);
|
||||
cudnnSetTensor4dDescriptor(l.dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l.batch, l.out_c, l.out_h, l.out_w);
|
||||
cudnnSetTensor4dDescriptor(l.normTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, l.out_c, 1, 1);
|
||||
hipdnnCreateTensorDescriptor(&l.normTensorDesc);
|
||||
hipdnnCreateTensorDescriptor(&l.dstTensorDesc);
|
||||
hipdnnSetTensor4dDescriptor(l.dstTensorDesc, HIPDNN_TENSOR_NCHW, HIPDNN_DATA_FLOAT, l.batch, l.out_c, l.out_h, l.out_w);
|
||||
hipdnnSetTensor4dDescriptor(l.normTensorDesc, HIPDNN_TENSOR_NCHW, HIPDNN_DATA_FLOAT, 1, l.out_c, 1, 1);
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
|
@ -129,48 +132,45 @@ layer make_connected_layer(int batch, int inputs, int outputs, ACTIVATION activa
|
|||
return l;
|
||||
}
|
||||
|
||||
void update_connected_layer(layer l, update_args a)
|
||||
{
|
||||
float learning_rate = a.learning_rate*l.learning_rate_scale;
|
||||
void update_connected_layer(layer l, update_args a) {
|
||||
float learning_rate = a.learning_rate * l.learning_rate_scale;
|
||||
float momentum = a.momentum;
|
||||
float decay = a.decay;
|
||||
int batch = a.batch;
|
||||
axpy_cpu(l.outputs, learning_rate/batch, l.bias_updates, 1, l.biases, 1);
|
||||
axpy_cpu(l.outputs, learning_rate / batch, l.bias_updates, 1, l.biases, 1);
|
||||
scal_cpu(l.outputs, momentum, l.bias_updates, 1);
|
||||
|
||||
if(l.batch_normalize){
|
||||
axpy_cpu(l.outputs, learning_rate/batch, l.scale_updates, 1, l.scales, 1);
|
||||
if (l.batch_normalize) {
|
||||
axpy_cpu(l.outputs, learning_rate / batch, l.scale_updates, 1, l.scales, 1);
|
||||
scal_cpu(l.outputs, momentum, l.scale_updates, 1);
|
||||
}
|
||||
|
||||
axpy_cpu(l.inputs*l.outputs, -decay*batch, l.weights, 1, l.weight_updates, 1);
|
||||
axpy_cpu(l.inputs*l.outputs, learning_rate/batch, l.weight_updates, 1, l.weights, 1);
|
||||
scal_cpu(l.inputs*l.outputs, momentum, l.weight_updates, 1);
|
||||
axpy_cpu(l.inputs * l.outputs, -decay * batch, l.weights, 1, l.weight_updates, 1);
|
||||
axpy_cpu(l.inputs * l.outputs, learning_rate / batch, l.weight_updates, 1, l.weights, 1);
|
||||
scal_cpu(l.inputs * l.outputs, momentum, l.weight_updates, 1);
|
||||
}
|
||||
|
||||
void forward_connected_layer(layer l, network net)
|
||||
{
|
||||
fill_cpu(l.outputs*l.batch, 0, l.output, 1);
|
||||
void forward_connected_layer(layer l, network net) {
|
||||
fill_cpu(l.outputs * l.batch, 0, l.output, 1);
|
||||
int m = l.batch;
|
||||
int k = l.inputs;
|
||||
int n = l.outputs;
|
||||
float *a = net.input;
|
||||
float *b = l.weights;
|
||||
float *c = l.output;
|
||||
gemm(0,1,m,n,k,1,a,k,b,k,1,c,n);
|
||||
if(l.batch_normalize){
|
||||
gemm(0, 1, m, n, k, 1, a, k, b, k, 1, c, n);
|
||||
if (l.batch_normalize) {
|
||||
forward_batchnorm_layer(l, net);
|
||||
} else {
|
||||
add_bias(l.output, l.biases, l.batch, l.outputs, 1);
|
||||
}
|
||||
activate_array(l.output, l.outputs*l.batch, l.activation);
|
||||
activate_array(l.output, l.outputs * l.batch, l.activation);
|
||||
}
|
||||
|
||||
void backward_connected_layer(layer l, network net)
|
||||
{
|
||||
gradient_array(l.output, l.outputs*l.batch, l.activation, l.delta);
|
||||
void backward_connected_layer(layer l, network net) {
|
||||
gradient_array(l.output, l.outputs * l.batch, l.activation, l.delta);
|
||||
|
||||
if(l.batch_normalize){
|
||||
if (l.batch_normalize) {
|
||||
backward_batchnorm_layer(l, net);
|
||||
} else {
|
||||
backward_bias(l.bias_updates, l.delta, l.batch, l.outputs, 1);
|
||||
|
@ -182,7 +182,7 @@ void backward_connected_layer(layer l, network net)
|
|||
float *a = l.delta;
|
||||
float *b = net.input;
|
||||
float *c = l.weight_updates;
|
||||
gemm(1,0,m,n,k,1,a,m,b,n,1,c,n);
|
||||
gemm(1, 0, m, n, k, 1, a, m, b, n, 1, c, n);
|
||||
|
||||
m = l.batch;
|
||||
k = l.outputs;
|
||||
|
@ -192,17 +192,16 @@ void backward_connected_layer(layer l, network net)
|
|||
b = l.weights;
|
||||
c = net.delta;
|
||||
|
||||
if(c) gemm(0,0,m,n,k,1,a,k,b,n,1,c,n);
|
||||
if (c) gemm(0, 0, m, n, k, 1, a, k, b, n, 1, c, n);
|
||||
}
|
||||
|
||||
|
||||
void denormalize_connected_layer(layer l)
|
||||
{
|
||||
void denormalize_connected_layer(layer l) {
|
||||
int i, j;
|
||||
for(i = 0; i < l.outputs; ++i){
|
||||
float scale = l.scales[i]/sqrt(l.rolling_variance[i] + .000001);
|
||||
for(j = 0; j < l.inputs; ++j){
|
||||
l.weights[i*l.inputs + j] *= scale;
|
||||
for (i = 0; i < l.outputs; ++i) {
|
||||
float scale = l.scales[i] / sqrt(l.rolling_variance[i] + .000001);
|
||||
for (j = 0; j < l.inputs; ++j) {
|
||||
l.weights[i * l.inputs + j] *= scale;
|
||||
}
|
||||
l.biases[i] -= l.rolling_mean[i] * scale;
|
||||
l.scales[i] = 1;
|
||||
|
@ -212,9 +211,8 @@ void denormalize_connected_layer(layer l)
|
|||
}
|
||||
|
||||
|
||||
void statistics_connected_layer(layer l)
|
||||
{
|
||||
if(l.batch_normalize){
|
||||
void statistics_connected_layer(layer l) {
|
||||
if (l.batch_normalize) {
|
||||
printf("Scales ");
|
||||
print_statistics(l.scales, l.outputs);
|
||||
/*
|
|
@ -8,7 +8,9 @@
|
|||
layer make_connected_layer(int batch, int inputs, int outputs, ACTIVATION activation, int batch_normalize, int adam);
|
||||
|
||||
void forward_connected_layer(layer l, network net);
|
||||
|
||||
void backward_connected_layer(layer l, network net);
|
||||
|
||||
void update_connected_layer(layer l, update_args a);
|
||||
|
||||
#ifdef GPU
|
||||
|
|
|
@ -1,8 +1,9 @@
|
|||
#include "cuda_runtime.h"
|
||||
#include "curand.h"
|
||||
#include "cublas_v2.h"
|
||||
#include "hip/hip_runtime.h"
|
||||
#include "hiprand.h"
|
||||
#include "hipblas.h"
|
||||
|
||||
|
||||
|
||||
extern "C" {
|
||||
#include "convolutional_layer.h"
|
||||
#include "batchnorm_layer.h"
|
||||
#include "gemm.h"
|
||||
|
@ -11,83 +12,78 @@ extern "C" {
|
|||
#include "col2im.h"
|
||||
#include "utils.h"
|
||||
#include "cuda.h"
|
||||
}
|
||||
|
||||
__global__ void binarize_kernel(float *x, int n, float *binary)
|
||||
{
|
||||
int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
|
||||
//}
|
||||
#define BLOCK 512
|
||||
|
||||
__global__ void binarize_kernel(float *x, int n, float *binary) {
|
||||
int i = (blockIdx.x + blockIdx.y * gridDim.x) * blockDim.x + threadIdx.x;
|
||||
if (i >= n) return;
|
||||
binary[i] = (x[i] >= 0) ? 1 : -1;
|
||||
}
|
||||
|
||||
void binarize_gpu(float *x, int n, float *binary)
|
||||
{
|
||||
void binarize_gpu(float *x, int n, float *binary) {
|
||||
binarize_kernel<<<cuda_gridsize(n), BLOCK>>>(x, n, binary);
|
||||
check_error(cudaPeekAtLastError());
|
||||
check_error(hipPeekAtLastError());
|
||||
}
|
||||
|
||||
__global__ void binarize_input_kernel(float *input, int n, int size, float *binary)
|
||||
{
|
||||
int s = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
|
||||
__global__ void binarize_input_kernel(float *input, int n, int size, float *binary) {
|
||||
int s = (blockIdx.x + blockIdx.y * gridDim.x) * blockDim.x + threadIdx.x;
|
||||
if (s >= size) return;
|
||||
int i = 0;
|
||||
float mean = 0;
|
||||
for(i = 0; i < n; ++i){
|
||||
mean += fabsf(input[i*size + s]);
|
||||
for (i = 0; i < n; ++i) {
|
||||
mean += fabsf(input[i * size + s]);
|
||||
}
|
||||
mean = mean / n;
|
||||
for(i = 0; i < n; ++i){
|
||||
binary[i*size + s] = (input[i*size + s] > 0) ? mean : -mean;
|
||||
for (i = 0; i < n; ++i) {
|
||||
binary[i * size + s] = (input[i * size + s] > 0) ? mean : -mean;
|
||||
}
|
||||
}
|
||||
|
||||
void binarize_input_gpu(float *input, int n, int size, float *binary)
|
||||
{
|
||||
void binarize_input_gpu(float *input, int n, int size, float *binary) {
|
||||
binarize_input_kernel<<<cuda_gridsize(size), BLOCK>>>(input, n, size, binary);
|
||||
check_error(cudaPeekAtLastError());
|
||||
check_error(hipPeekAtLastError());
|
||||
}
|
||||
|
||||
|
||||
__global__ void binarize_weights_kernel(float *weights, int n, int size, float *binary)
|
||||
{
|
||||
int f = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
|
||||
__global__ void binarize_weights_kernel(float *weights, int n, int size, float *binary) {
|
||||
int f = (blockIdx.x + blockIdx.y * gridDim.x) * blockDim.x + threadIdx.x;
|
||||
if (f >= n) return;
|
||||
int i = 0;
|
||||
float mean = 0;
|
||||
for(i = 0; i < size; ++i){
|
||||
mean += fabsf(weights[f*size + i]);
|
||||
for (i = 0; i < size; ++i) {
|
||||
mean += fabsf(weights[f * size + i]);
|
||||
}
|
||||
mean = mean / size;
|
||||
for(i = 0; i < size; ++i){
|
||||
binary[f*size + i] = (weights[f*size + i] > 0) ? mean : -mean;
|
||||
for (i = 0; i < size; ++i) {
|
||||
binary[f * size + i] = (weights[f * size + i] > 0) ? mean : -mean;
|
||||
//binary[f*size + i] = weights[f*size + i];
|
||||
}
|
||||
}
|
||||
|
||||
void binarize_weights_gpu(float *weights, int n, int size, float *binary)
|
||||
{
|
||||
void binarize_weights_gpu(float *weights, int n, int size, float *binary) {
|
||||
binarize_weights_kernel<<<cuda_gridsize(n), BLOCK>>>(weights, n, size, binary);
|
||||
check_error(cudaPeekAtLastError());
|
||||
check_error(hipPeekAtLastError());
|
||||
}
|
||||
|
||||
void forward_convolutional_layer_gpu(convolutional_layer l, network net)
|
||||
{
|
||||
fill_gpu(l.outputs*l.batch, 0, l.output_gpu, 1);
|
||||
if(l.binary){
|
||||
binarize_weights_gpu(l.weights_gpu, l.n, l.c/l.groups*l.size*l.size, l.binary_weights_gpu);
|
||||
void forward_convolutional_layer_gpu(convolutional_layer l, network net) {
|
||||
fill_gpu(l.outputs * l.batch, 0, l.output_gpu, 1);
|
||||
if (l.binary) {
|
||||
binarize_weights_gpu(l.weights_gpu, l.n, l.c / l.groups * l.size * l.size, l.binary_weights_gpu);
|
||||
swap_binary(&l);
|
||||
}
|
||||
|
||||
if(l.xnor){
|
||||
binarize_weights_gpu(l.weights_gpu, l.n, l.c/l.groups*l.size*l.size, l.binary_weights_gpu);
|
||||
if (l.xnor) {
|
||||
binarize_weights_gpu(l.weights_gpu, l.n, l.c / l.groups * l.size * l.size, l.binary_weights_gpu);
|
||||
swap_binary(&l);
|
||||
binarize_gpu(net.input_gpu, l.c*l.h*l.w*l.batch, l.binary_input_gpu);
|
||||
binarize_gpu(net.input_gpu, l.c * l.h * l.w * l.batch, l.binary_input_gpu);
|
||||
net.input_gpu = l.binary_input_gpu;
|
||||
}
|
||||
|
||||
#ifdef CUDNN
|
||||
float one = 1;
|
||||
cudnnConvolutionForward(cudnn_handle(),
|
||||
hipdnnConvolutionForward(cudnn_handle(),
|
||||
&one,
|
||||
l.srcTensorDesc,
|
||||
net.input_gpu,
|
||||
|
@ -103,22 +99,22 @@ void forward_convolutional_layer_gpu(convolutional_layer l, network net)
|
|||
|
||||
#else
|
||||
int i, j;
|
||||
int m = l.n/l.groups;
|
||||
int k = l.size*l.size*l.c/l.groups;
|
||||
int n = l.out_w*l.out_h;
|
||||
for(i = 0; i < l.batch; ++i){
|
||||
for(j = 0; j < l.groups; ++j){
|
||||
float *a = l.weights_gpu + j*l.nweights/l.groups;
|
||||
int m = l.n / l.groups;
|
||||
int k = l.size * l.size * l.c / l.groups;
|
||||
int n = l.out_w * l.out_h;
|
||||
for (i = 0; i < l.batch; ++i) {
|
||||
for (j = 0; j < l.groups; ++j) {
|
||||
float *a = l.weights_gpu + j * l.nweights / l.groups;
|
||||
float *b = net.workspace;
|
||||
float *c = l.output_gpu + (i*l.groups + j)*n*m;
|
||||
float *im = net.input_gpu + (i*l.groups + j)*l.c/l.groups*l.h*l.w;
|
||||
float *c = l.output_gpu + (i * l.groups + j) * n * m;
|
||||
float *im = net.input_gpu + (i * l.groups + j) * l.c / l.groups * l.h * l.w;
|
||||
|
||||
if (l.size == 1){
|
||||
if (l.size == 1) {
|
||||
b = im;
|
||||
} else {
|
||||
im2col_gpu(im, l.c/l.groups, l.h, l.w, l.size, l.stride, l.pad, b);
|
||||
im2col_gpu(im, l.c / l.groups, l.h, l.w, l.size, l.stride, l.pad, b);
|
||||
}
|
||||
gemm_gpu(0,0,m,n,k,1,a,k,b,n,1,c,n);
|
||||
gemm_gpu(0, 0, m, n, k, 1, a, k, b, n, 1, c, n);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
@ -126,18 +122,17 @@ void forward_convolutional_layer_gpu(convolutional_layer l, network net)
|
|||
if (l.batch_normalize) {
|
||||
forward_batchnorm_layer_gpu(l, net);
|
||||
} else {
|
||||
add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.n, l.out_w*l.out_h);
|
||||
add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.n, l.out_w * l.out_h);
|
||||
}
|
||||
|
||||
activate_array_gpu(l.output_gpu, l.outputs*l.batch, l.activation);
|
||||
activate_array_gpu(l.output_gpu, l.outputs * l.batch, l.activation);
|
||||
//if(l.dot > 0) dot_error_gpu(l);
|
||||
if(l.binary || l.xnor) swap_binary(&l);
|
||||
if (l.binary || l.xnor) swap_binary(&l);
|
||||
}
|
||||
|
||||
__global__ void smooth_kernel(float *x, int n, int w, int h, int c, int size, float rate, float *delta)
|
||||
{
|
||||
int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
|
||||
if(id >= n) return;
|
||||
__global__ void smooth_kernel(float *x, int n, int w, int h, int c, int size, float rate, float *delta) {
|
||||
int id = (blockIdx.x + blockIdx.y * gridDim.x) * blockDim.x + threadIdx.x;
|
||||
if (id >= n) return;
|
||||
|
||||
int j = id % w;
|
||||
id /= w;
|
||||
|
@ -147,55 +142,53 @@ __global__ void smooth_kernel(float *x, int n, int w, int h, int c, int size, fl
|
|||
id /= c;
|
||||
int b = id;
|
||||
|
||||
int w_offset = -(size/2.f);
|
||||
int h_offset = -(size/2.f);
|
||||
int w_offset = -(size / 2.f);
|
||||
int h_offset = -(size / 2.f);
|
||||
|
||||
int out_index = j + w*(i + h*(k + c*b));
|
||||
int out_index = j + w * (i + h * (k + c * b));
|
||||
int l, m;
|
||||
for(l = 0; l < size; ++l){
|
||||
for(m = 0; m < size; ++m){
|
||||
for (l = 0; l < size; ++l) {
|
||||
for (m = 0; m < size; ++m) {
|
||||
int cur_h = h_offset + i + l;
|
||||
int cur_w = w_offset + j + m;
|
||||
int index = cur_w + w*(cur_h + h*(k + b*c));
|
||||
int index = cur_w + w * (cur_h + h * (k + b * c));
|
||||
int valid = (cur_h >= 0 && cur_h < h &&
|
||||
cur_w >= 0 && cur_w < w);
|
||||
delta[out_index] += valid ? rate*(x[index] - x[out_index]) : 0;
|
||||
cur_w >= 0 && cur_w < w);
|
||||
delta[out_index] += valid ? rate * (x[index] - x[out_index]) : 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" void smooth_layer(layer l, int size, float rate)
|
||||
{
|
||||
void smooth_layer(layer l, int size, float rate) {
|
||||
int h = l.out_h;
|
||||
int w = l.out_w;
|
||||
int c = l.out_c;
|
||||
|
||||
size_t n = h*w*c*l.batch;
|
||||
size_t n = h * w * c * l.batch;
|
||||
|
||||
smooth_kernel<<<cuda_gridsize(n), BLOCK>>>(l.output_gpu, n, l.w, l.h, l.c, size, rate, l.delta_gpu);
|
||||
check_error(cudaPeekAtLastError());
|
||||
check_error(hipPeekAtLastError());
|
||||
}
|
||||
|
||||
void backward_convolutional_layer_gpu(convolutional_layer l, network net)
|
||||
{
|
||||
if(l.smooth){
|
||||
void backward_convolutional_layer_gpu(convolutional_layer l, network net) {
|
||||
if (l.smooth) {
|
||||
smooth_layer(l, 5, l.smooth);
|
||||
}
|
||||
//constrain_gpu(l.outputs*l.batch, 1, l.delta_gpu, 1);
|
||||
gradient_array_gpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu);
|
||||
gradient_array_gpu(l.output_gpu, l.outputs * l.batch, l.activation, l.delta_gpu);
|
||||
|
||||
|
||||
if(l.batch_normalize){
|
||||
if (l.batch_normalize) {
|
||||
backward_batchnorm_layer_gpu(l, net);
|
||||
} else {
|
||||
backward_bias_gpu(l.bias_updates_gpu, l.delta_gpu, l.batch, l.n, l.out_w*l.out_h);
|
||||
backward_bias_gpu(l.bias_updates_gpu, l.delta_gpu, l.batch, l.n, l.out_w * l.out_h);
|
||||
}
|
||||
float *original_input = net.input_gpu;
|
||||
|
||||
if(l.xnor) net.input_gpu = l.binary_input_gpu;
|
||||
if (l.xnor) net.input_gpu = l.binary_input_gpu;
|
||||
#ifdef CUDNN
|
||||
float one = 1;
|
||||
cudnnConvolutionBackwardFilter(cudnn_handle(),
|
||||
hipdnnConvolutionBackwardFilter(cudnn_handle(),
|
||||
&one,
|
||||
l.srcTensorDesc,
|
||||
net.input_gpu,
|
||||
|
@ -211,7 +204,7 @@ void backward_convolutional_layer_gpu(convolutional_layer l, network net)
|
|||
|
||||
if(net.delta_gpu){
|
||||
if(l.binary || l.xnor) swap_binary(&l);
|
||||
cudnnConvolutionBackwardData(cudnn_handle(),
|
||||
hipdnnConvolutionBackwardData(cudnn_handle(),
|
||||
&one,
|
||||
l.weightDesc,
|
||||
l.weights_gpu,
|
||||
|
@ -229,100 +222,102 @@ void backward_convolutional_layer_gpu(convolutional_layer l, network net)
|
|||
}
|
||||
|
||||
#else
|
||||
int m = l.n/l.groups;
|
||||
int n = l.size*l.size*l.c/l.groups;
|
||||
int k = l.out_w*l.out_h;
|
||||
int m = l.n / l.groups;
|
||||
int n = l.size * l.size * l.c / l.groups;
|
||||
int k = l.out_w * l.out_h;
|
||||
|
||||
int i, j;
|
||||
for(i = 0; i < l.batch; ++i){
|
||||
for(j = 0; j < l.groups; ++j){
|
||||
float *a = l.delta_gpu + (i*l.groups + j)*m*k;
|
||||
for (i = 0; i < l.batch; ++i) {
|
||||
for (j = 0; j < l.groups; ++j) {
|
||||
float *a = l.delta_gpu + (i * l.groups + j) * m * k;
|
||||
float *b = net.workspace;
|
||||
float *c = l.weight_updates_gpu + j*l.nweights/l.groups;
|
||||
float *c = l.weight_updates_gpu + j * l.nweights / l.groups;
|
||||
|
||||
float *im = net.input_gpu+(i*l.groups + j)*l.c/l.groups*l.h*l.w;
|
||||
float *imd = net.delta_gpu+(i*l.groups + j)*l.c/l.groups*l.h*l.w;
|
||||
float *im = net.input_gpu + (i * l.groups + j) * l.c / l.groups * l.h * l.w;
|
||||
float *imd = net.delta_gpu + (i * l.groups + j) * l.c / l.groups * l.h * l.w;
|
||||
|
||||
im2col_gpu(im, l.c/l.groups, l.h, l.w, l.size, l.stride, l.pad, b);
|
||||
gemm_gpu(0,1,m,n,k,1,a,k,b,k,1,c,n);
|
||||
im2col_gpu(im, l.c / l.groups, l.h, l.w, l.size, l.stride, l.pad, b);
|
||||
gemm_gpu(0, 1, m, n, k, 1, a, k, b, k, 1, c, n);
|
||||
|
||||
if (net.delta_gpu) {
|
||||
if (l.binary || l.xnor) swap_binary(&l);
|
||||
a = l.weights_gpu + j*l.nweights/l.groups;
|
||||
b = l.delta_gpu + (i*l.groups + j)*m*k;
|
||||
a = l.weights_gpu + j * l.nweights / l.groups;
|
||||
b = l.delta_gpu + (i * l.groups + j) * m * k;
|
||||
c = net.workspace;
|
||||
if (l.size == 1) {
|
||||
c = imd;
|
||||
}
|
||||
|
||||
gemm_gpu(1,0,n,k,m,1,a,n,b,k,0,c,k);
|
||||
gemm_gpu(1, 0, n, k, m, 1, a, n, b, k, 0, c, k);
|
||||
|
||||
if (l.size != 1) {
|
||||
col2im_gpu(net.workspace, l.c/l.groups, l.h, l.w, l.size, l.stride, l.pad, imd);
|
||||
col2im_gpu(net.workspace, l.c / l.groups, l.h, l.w, l.size, l.stride, l.pad, imd);
|
||||
}
|
||||
if(l.binary || l.xnor) {
|
||||
if (l.binary || l.xnor) {
|
||||
swap_binary(&l);
|
||||
}
|
||||
}
|
||||
if(l.xnor) gradient_array_gpu(original_input + i*l.c*l.h*l.w, l.c*l.h*l.w, HARDTAN, net.delta_gpu + i*l.c*l.h*l.w);
|
||||
if (l.xnor)
|
||||
gradient_array_gpu(original_input + i * l.c * l.h * l.w, l.c * l.h * l.w, HARDTAN,
|
||||
net.delta_gpu + i * l.c * l.h * l.w);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
void pull_convolutional_layer(layer l)
|
||||
{
|
||||
void pull_convolutional_layer(layer l) {
|
||||
cuda_pull_array(l.weights_gpu, l.weights, l.nweights);
|
||||
cuda_pull_array(l.biases_gpu, l.biases, l.n);
|
||||
cuda_pull_array(l.weight_updates_gpu, l.weight_updates, l.nweights);
|
||||
cuda_pull_array(l.bias_updates_gpu, l.bias_updates, l.n);
|
||||
if (l.batch_normalize){
|
||||
if (l.batch_normalize) {
|
||||
cuda_pull_array(l.scales_gpu, l.scales, l.n);
|
||||
cuda_pull_array(l.rolling_mean_gpu, l.rolling_mean, l.n);
|
||||
cuda_pull_array(l.rolling_variance_gpu, l.rolling_variance, l.n);
|
||||
}
|
||||
}
|
||||
|
||||
void push_convolutional_layer(layer l)
|
||||
{
|
||||
void push_convolutional_layer(layer l) {
|
||||
cuda_push_array(l.weights_gpu, l.weights, l.nweights);
|
||||
cuda_push_array(l.biases_gpu, l.biases, l.n);
|
||||
cuda_push_array(l.weight_updates_gpu, l.weight_updates, l.nweights);
|
||||
cuda_push_array(l.bias_updates_gpu, l.bias_updates, l.n);
|
||||
if (l.batch_normalize){
|
||||
if (l.batch_normalize) {
|
||||
cuda_push_array(l.scales_gpu, l.scales, l.n);
|
||||
cuda_push_array(l.rolling_mean_gpu, l.rolling_mean, l.n);
|
||||
cuda_push_array(l.rolling_variance_gpu, l.rolling_variance, l.n);
|
||||
}
|
||||
}
|
||||
|
||||
void update_convolutional_layer_gpu(layer l, update_args a)
|
||||
{
|
||||
float learning_rate = a.learning_rate*l.learning_rate_scale;
|
||||
void update_convolutional_layer_gpu(layer l, update_args a) {
|
||||
float learning_rate = a.learning_rate * l.learning_rate_scale;
|
||||
float momentum = a.momentum;
|
||||
float decay = a.decay;
|
||||
int batch = a.batch;
|
||||
|
||||
if(a.adam){
|
||||
adam_update_gpu(l.weights_gpu, l.weight_updates_gpu, l.m_gpu, l.v_gpu, a.B1, a.B2, a.eps, decay, learning_rate, l.nweights, batch, a.t);
|
||||
adam_update_gpu(l.biases_gpu, l.bias_updates_gpu, l.bias_m_gpu, l.bias_v_gpu, a.B1, a.B2, a.eps, decay, learning_rate, l.n, batch, a.t);
|
||||
if(l.scales_gpu){
|
||||
adam_update_gpu(l.scales_gpu, l.scale_updates_gpu, l.scale_m_gpu, l.scale_v_gpu, a.B1, a.B2, a.eps, decay, learning_rate, l.n, batch, a.t);
|
||||
if (a.adam) {
|
||||
adam_update_gpu(l.weights_gpu, l.weight_updates_gpu, l.m_gpu, l.v_gpu, a.B1, a.B2, a.eps, decay, learning_rate,
|
||||
l.nweights, batch, a.t);
|
||||
adam_update_gpu(l.biases_gpu, l.bias_updates_gpu, l.bias_m_gpu, l.bias_v_gpu, a.B1, a.B2, a.eps, decay,
|
||||
learning_rate, l.n, batch, a.t);
|
||||
if (l.scales_gpu) {
|
||||
adam_update_gpu(l.scales_gpu, l.scale_updates_gpu, l.scale_m_gpu, l.scale_v_gpu, a.B1, a.B2, a.eps, decay,
|
||||
learning_rate, l.n, batch, a.t);
|
||||
}
|
||||
}else{
|
||||
axpy_gpu(l.nweights, -decay*batch, l.weights_gpu, 1, l.weight_updates_gpu, 1);
|
||||
axpy_gpu(l.nweights, learning_rate/batch, l.weight_updates_gpu, 1, l.weights_gpu, 1);
|
||||
} else {
|
||||
axpy_gpu(l.nweights, -decay * batch, l.weights_gpu, 1, l.weight_updates_gpu, 1);
|
||||
axpy_gpu(l.nweights, learning_rate / batch, l.weight_updates_gpu, 1, l.weights_gpu, 1);
|
||||
scal_gpu(l.nweights, momentum, l.weight_updates_gpu, 1);
|
||||
|
||||
axpy_gpu(l.n, learning_rate/batch, l.bias_updates_gpu, 1, l.biases_gpu, 1);
|
||||
axpy_gpu(l.n, learning_rate / batch, l.bias_updates_gpu, 1, l.biases_gpu, 1);
|
||||
scal_gpu(l.n, momentum, l.bias_updates_gpu, 1);
|
||||
|
||||
if(l.scales_gpu){
|
||||
axpy_gpu(l.n, learning_rate/batch, l.scale_updates_gpu, 1, l.scales_gpu, 1);
|
||||
if (l.scales_gpu) {
|
||||
axpy_gpu(l.n, learning_rate / batch, l.scale_updates_gpu, 1, l.scales_gpu, 1);
|
||||
scal_gpu(l.n, momentum, l.scale_updates_gpu, 1);
|
||||
}
|
||||
}
|
||||
if(l.clip){
|
||||
if (l.clip) {
|
||||
constrain_gpu(l.nweights, l.clip, l.weights_gpu, 1);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,622 +0,0 @@
|
|||
#include "convolutional_layer.h"
|
||||
#include "utils.h"
|
||||
#include "batchnorm_layer.h"
|
||||
#include "im2col.h"
|
||||
#include "col2im.h"
|
||||
#include "blas.h"
|
||||
#include "gemm.h"
|
||||
#include <stdio.h>
|
||||
#include <time.h>
|
||||
|
||||
#ifdef AI2
|
||||
#include "xnor_layer.h"
|
||||
#endif
|
||||
|
||||
void swap_binary(convolutional_layer *l)
|
||||
{
|
||||
float *swap = l->weights;
|
||||
l->weights = l->binary_weights;
|
||||
l->binary_weights = swap;
|
||||
|
||||
#ifdef GPU
|
||||
swap = l->weights_gpu;
|
||||
l->weights_gpu = l->binary_weights_gpu;
|
||||
l->binary_weights_gpu = swap;
|
||||
#endif
|
||||
}
|
||||
|
||||
void binarize_weights(float *weights, int n, int size, float *binary)
|
||||
{
|
||||
int i, f;
|
||||
for(f = 0; f < n; ++f){
|
||||
float mean = 0;
|
||||
for(i = 0; i < size; ++i){
|
||||
mean += fabs(weights[f*size + i]);
|
||||
}
|
||||
mean = mean / size;
|
||||
for(i = 0; i < size; ++i){
|
||||
binary[f*size + i] = (weights[f*size + i] > 0) ? mean : -mean;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void binarize_cpu(float *input, int n, float *binary)
|
||||
{
|
||||
int i;
|
||||
for(i = 0; i < n; ++i){
|
||||
binary[i] = (input[i] > 0) ? 1 : -1;
|
||||
}
|
||||
}
|
||||
|
||||
void binarize_input(float *input, int n, int size, float *binary)
|
||||
{
|
||||
int i, s;
|
||||
for(s = 0; s < size; ++s){
|
||||
float mean = 0;
|
||||
for(i = 0; i < n; ++i){
|
||||
mean += fabs(input[i*size + s]);
|
||||
}
|
||||
mean = mean / n;
|
||||
for(i = 0; i < n; ++i){
|
||||
binary[i*size + s] = (input[i*size + s] > 0) ? mean : -mean;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int convolutional_out_height(convolutional_layer l)
|
||||
{
|
||||
return (l.h + 2*l.pad - l.size) / l.stride + 1;
|
||||
}
|
||||
|
||||
int convolutional_out_width(convolutional_layer l)
|
||||
{
|
||||
return (l.w + 2*l.pad - l.size) / l.stride + 1;
|
||||
}
|
||||
|
||||
image get_convolutional_image(convolutional_layer l)
|
||||
{
|
||||
return float_to_image(l.out_w,l.out_h,l.out_c,l.output);
|
||||
}
|
||||
|
||||
image get_convolutional_delta(convolutional_layer l)
|
||||
{
|
||||
return float_to_image(l.out_w,l.out_h,l.out_c,l.delta);
|
||||
}
|
||||
|
||||
static size_t get_workspace_size(layer l){
|
||||
#ifdef CUDNN
|
||||
if(gpu_index >= 0){
|
||||
size_t most = 0;
|
||||
size_t s = 0;
|
||||
cudnnGetConvolutionForwardWorkspaceSize(cudnn_handle(),
|
||||
l.srcTensorDesc,
|
||||
l.weightDesc,
|
||||
l.convDesc,
|
||||
l.dstTensorDesc,
|
||||
l.fw_algo,
|
||||
&s);
|
||||
if (s > most) most = s;
|
||||
cudnnGetConvolutionBackwardFilterWorkspaceSize(cudnn_handle(),
|
||||
l.srcTensorDesc,
|
||||
l.ddstTensorDesc,
|
||||
l.convDesc,
|
||||
l.dweightDesc,
|
||||
l.bf_algo,
|
||||
&s);
|
||||
if (s > most) most = s;
|
||||
cudnnGetConvolutionBackwardDataWorkspaceSize(cudnn_handle(),
|
||||
l.weightDesc,
|
||||
l.ddstTensorDesc,
|
||||
l.convDesc,
|
||||
l.dsrcTensorDesc,
|
||||
l.bd_algo,
|
||||
&s);
|
||||
if (s > most) most = s;
|
||||
return most;
|
||||
}
|
||||
#endif
|
||||
return (size_t)l.out_h*l.out_w*l.size*l.size*l.c/l.groups*sizeof(float);
|
||||
}
|
||||
|
||||
#ifdef GPU
|
||||
#ifdef CUDNN
|
||||
void cudnn_convolutional_setup(layer *l)
|
||||
{
|
||||
cudnnSetTensor4dDescriptor(l->dsrcTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->c, l->h, l->w);
|
||||
cudnnSetTensor4dDescriptor(l->ddstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->out_c, l->out_h, l->out_w);
|
||||
|
||||
cudnnSetTensor4dDescriptor(l->srcTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->c, l->h, l->w);
|
||||
cudnnSetTensor4dDescriptor(l->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->out_c, l->out_h, l->out_w);
|
||||
cudnnSetTensor4dDescriptor(l->normTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, l->out_c, 1, 1);
|
||||
|
||||
cudnnSetFilter4dDescriptor(l->dweightDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, l->n, l->c/l->groups, l->size, l->size);
|
||||
cudnnSetFilter4dDescriptor(l->weightDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, l->n, l->c/l->groups, l->size, l->size);
|
||||
#if CUDNN_MAJOR >= 6
|
||||
cudnnSetConvolution2dDescriptor(l->convDesc, l->pad, l->pad, l->stride, l->stride, 1, 1, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT);
|
||||
#else
|
||||
cudnnSetConvolution2dDescriptor(l->convDesc, l->pad, l->pad, l->stride, l->stride, 1, 1, CUDNN_CROSS_CORRELATION);
|
||||
#endif
|
||||
|
||||
#if CUDNN_MAJOR >= 7
|
||||
cudnnSetConvolutionGroupCount(l->convDesc, l->groups);
|
||||
#else
|
||||
if(l->groups > 1){
|
||||
error("CUDNN < 7 doesn't support groups, please upgrade!");
|
||||
}
|
||||
#endif
|
||||
|
||||
cudnnGetConvolutionForwardAlgorithm(cudnn_handle(),
|
||||
l->srcTensorDesc,
|
||||
l->weightDesc,
|
||||
l->convDesc,
|
||||
l->dstTensorDesc,
|
||||
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
|
||||
2000000000,
|
||||
&l->fw_algo);
|
||||
cudnnGetConvolutionBackwardDataAlgorithm(cudnn_handle(),
|
||||
l->weightDesc,
|
||||
l->ddstTensorDesc,
|
||||
l->convDesc,
|
||||
l->dsrcTensorDesc,
|
||||
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
|
||||
2000000000,
|
||||
&l->bd_algo);
|
||||
cudnnGetConvolutionBackwardFilterAlgorithm(cudnn_handle(),
|
||||
l->srcTensorDesc,
|
||||
l->ddstTensorDesc,
|
||||
l->convDesc,
|
||||
l->dweightDesc,
|
||||
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
|
||||
2000000000,
|
||||
&l->bf_algo);
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int n, int groups, int size, int stride, int padding, ACTIVATION activation, int batch_normalize, int binary, int xnor, int adam)
|
||||
{
|
||||
int i;
|
||||
convolutional_layer l = {0};
|
||||
l.type = CONVOLUTIONAL;
|
||||
|
||||
l.groups = groups;
|
||||
l.h = h;
|
||||
l.w = w;
|
||||
l.c = c;
|
||||
l.n = n;
|
||||
l.binary = binary;
|
||||
l.xnor = xnor;
|
||||
l.batch = batch;
|
||||
l.stride = stride;
|
||||
l.size = size;
|
||||
l.pad = padding;
|
||||
l.batch_normalize = batch_normalize;
|
||||
|
||||
l.weights = calloc(c/groups*n*size*size, sizeof(float));
|
||||
l.weight_updates = calloc(c/groups*n*size*size, sizeof(float));
|
||||
|
||||
l.biases = calloc(n, sizeof(float));
|
||||
l.bias_updates = calloc(n, sizeof(float));
|
||||
|
||||
l.nweights = c/groups*n*size*size;
|
||||
l.nbiases = n;
|
||||
|
||||
// float scale = 1./sqrt(size*size*c);
|
||||
float scale = sqrt(2./(size*size*c/l.groups));
|
||||
//printf("convscale %f\n", scale);
|
||||
//scale = .02;
|
||||
//for(i = 0; i < c*n*size*size; ++i) l.weights[i] = scale*rand_uniform(-1, 1);
|
||||
for(i = 0; i < l.nweights; ++i) l.weights[i] = scale*rand_normal();
|
||||
int out_w = convolutional_out_width(l);
|
||||
int out_h = convolutional_out_height(l);
|
||||
l.out_h = out_h;
|
||||
l.out_w = out_w;
|
||||
l.out_c = n;
|
||||
l.outputs = l.out_h * l.out_w * l.out_c;
|
||||
l.inputs = l.w * l.h * l.c;
|
||||
|
||||
l.output = calloc(l.batch*l.outputs, sizeof(float));
|
||||
l.delta = calloc(l.batch*l.outputs, sizeof(float));
|
||||
|
||||
l.forward = forward_convolutional_layer;
|
||||
l.backward = backward_convolutional_layer;
|
||||
l.update = update_convolutional_layer;
|
||||
if(binary){
|
||||
l.binary_weights = calloc(l.nweights, sizeof(float));
|
||||
l.cweights = calloc(l.nweights, sizeof(char));
|
||||
l.scales = calloc(n, sizeof(float));
|
||||
}
|
||||
if(xnor){
|
||||
l.binary_weights = calloc(l.nweights, sizeof(float));
|
||||
l.binary_input = calloc(l.inputs*l.batch, sizeof(float));
|
||||
}
|
||||
|
||||
if(batch_normalize){
|
||||
l.scales = calloc(n, sizeof(float));
|
||||
l.scale_updates = calloc(n, sizeof(float));
|
||||
for(i = 0; i < n; ++i){
|
||||
l.scales[i] = 1;
|
||||
}
|
||||
|
||||
l.mean = calloc(n, sizeof(float));
|
||||
l.variance = calloc(n, sizeof(float));
|
||||
|
||||
l.mean_delta = calloc(n, sizeof(float));
|
||||
l.variance_delta = calloc(n, sizeof(float));
|
||||
|
||||
l.rolling_mean = calloc(n, sizeof(float));
|
||||
l.rolling_variance = calloc(n, sizeof(float));
|
||||
l.x = calloc(l.batch*l.outputs, sizeof(float));
|
||||
l.x_norm = calloc(l.batch*l.outputs, sizeof(float));
|
||||
}
|
||||
if(adam){
|
||||
l.m = calloc(l.nweights, sizeof(float));
|
||||
l.v = calloc(l.nweights, sizeof(float));
|
||||
l.bias_m = calloc(n, sizeof(float));
|
||||
l.scale_m = calloc(n, sizeof(float));
|
||||
l.bias_v = calloc(n, sizeof(float));
|
||||
l.scale_v = calloc(n, sizeof(float));
|
||||
}
|
||||
|
||||
#ifdef GPU
|
||||
l.forward_gpu = forward_convolutional_layer_gpu;
|
||||
l.backward_gpu = backward_convolutional_layer_gpu;
|
||||
l.update_gpu = update_convolutional_layer_gpu;
|
||||
|
||||
if(gpu_index >= 0){
|
||||
if (adam) {
|
||||
l.m_gpu = cuda_make_array(l.m, l.nweights);
|
||||
l.v_gpu = cuda_make_array(l.v, l.nweights);
|
||||
l.bias_m_gpu = cuda_make_array(l.bias_m, n);
|
||||
l.bias_v_gpu = cuda_make_array(l.bias_v, n);
|
||||
l.scale_m_gpu = cuda_make_array(l.scale_m, n);
|
||||
l.scale_v_gpu = cuda_make_array(l.scale_v, n);
|
||||
}
|
||||
|
||||
l.weights_gpu = cuda_make_array(l.weights, l.nweights);
|
||||
l.weight_updates_gpu = cuda_make_array(l.weight_updates, l.nweights);
|
||||
|
||||
l.biases_gpu = cuda_make_array(l.biases, n);
|
||||
l.bias_updates_gpu = cuda_make_array(l.bias_updates, n);
|
||||
|
||||
l.delta_gpu = cuda_make_array(l.delta, l.batch*out_h*out_w*n);
|
||||
l.output_gpu = cuda_make_array(l.output, l.batch*out_h*out_w*n);
|
||||
|
||||
if(binary){
|
||||
l.binary_weights_gpu = cuda_make_array(l.weights, l.nweights);
|
||||
}
|
||||
if(xnor){
|
||||
l.binary_weights_gpu = cuda_make_array(l.weights, l.nweights);
|
||||
l.binary_input_gpu = cuda_make_array(0, l.inputs*l.batch);
|
||||
}
|
||||
|
||||
if(batch_normalize){
|
||||
l.mean_gpu = cuda_make_array(l.mean, n);
|
||||
l.variance_gpu = cuda_make_array(l.variance, n);
|
||||
|
||||
l.rolling_mean_gpu = cuda_make_array(l.mean, n);
|
||||
l.rolling_variance_gpu = cuda_make_array(l.variance, n);
|
||||
|
||||
l.mean_delta_gpu = cuda_make_array(l.mean, n);
|
||||
l.variance_delta_gpu = cuda_make_array(l.variance, n);
|
||||
|
||||
l.scales_gpu = cuda_make_array(l.scales, n);
|
||||
l.scale_updates_gpu = cuda_make_array(l.scale_updates, n);
|
||||
|
||||
l.x_gpu = cuda_make_array(l.output, l.batch*out_h*out_w*n);
|
||||
l.x_norm_gpu = cuda_make_array(l.output, l.batch*out_h*out_w*n);
|
||||
}
|
||||
#ifdef CUDNN
|
||||
cudnnCreateTensorDescriptor(&l.normTensorDesc);
|
||||
cudnnCreateTensorDescriptor(&l.srcTensorDesc);
|
||||
cudnnCreateTensorDescriptor(&l.dstTensorDesc);
|
||||
cudnnCreateFilterDescriptor(&l.weightDesc);
|
||||
cudnnCreateTensorDescriptor(&l.dsrcTensorDesc);
|
||||
cudnnCreateTensorDescriptor(&l.ddstTensorDesc);
|
||||
cudnnCreateFilterDescriptor(&l.dweightDesc);
|
||||
cudnnCreateConvolutionDescriptor(&l.convDesc);
|
||||
cudnn_convolutional_setup(&l);
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
l.workspace_size = get_workspace_size(l);
|
||||
l.activation = activation;
|
||||
|
||||
fprintf(stderr, "conv %5d %2d x%2d /%2d %4d x%4d x%4d -> %4d x%4d x%4d %5.3f BFLOPs\n", n, size, size, stride, w, h, c, l.out_w, l.out_h, l.out_c, (2.0 * l.n * l.size*l.size*l.c/l.groups * l.out_h*l.out_w)/1000000000.);
|
||||
|
||||
return l;
|
||||
}
|
||||
|
||||
void denormalize_convolutional_layer(convolutional_layer l)
|
||||
{
|
||||
int i, j;
|
||||
for(i = 0; i < l.n; ++i){
|
||||
float scale = l.scales[i]/sqrt(l.rolling_variance[i] + .00001);
|
||||
for(j = 0; j < l.c/l.groups*l.size*l.size; ++j){
|
||||
l.weights[i*l.c/l.groups*l.size*l.size + j] *= scale;
|
||||
}
|
||||
l.biases[i] -= l.rolling_mean[i] * scale;
|
||||
l.scales[i] = 1;
|
||||
l.rolling_mean[i] = 0;
|
||||
l.rolling_variance[i] = 1;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
void test_convolutional_layer()
|
||||
{
|
||||
convolutional_layer l = make_convolutional_layer(1, 5, 5, 3, 2, 5, 2, 1, LEAKY, 1, 0, 0, 0);
|
||||
l.batch_normalize = 1;
|
||||
float data[] = {1,1,1,1,1,
|
||||
1,1,1,1,1,
|
||||
1,1,1,1,1,
|
||||
1,1,1,1,1,
|
||||
1,1,1,1,1,
|
||||
2,2,2,2,2,
|
||||
2,2,2,2,2,
|
||||
2,2,2,2,2,
|
||||
2,2,2,2,2,
|
||||
2,2,2,2,2,
|
||||
3,3,3,3,3,
|
||||
3,3,3,3,3,
|
||||
3,3,3,3,3,
|
||||
3,3,3,3,3,
|
||||
3,3,3,3,3};
|
||||
//net.input = data;
|
||||
//forward_convolutional_layer(l);
|
||||
}
|
||||
*/
|
||||
|
||||
void resize_convolutional_layer(convolutional_layer *l, int w, int h)
|
||||
{
|
||||
l->w = w;
|
||||
l->h = h;
|
||||
int out_w = convolutional_out_width(*l);
|
||||
int out_h = convolutional_out_height(*l);
|
||||
|
||||
l->out_w = out_w;
|
||||
l->out_h = out_h;
|
||||
|
||||
l->outputs = l->out_h * l->out_w * l->out_c;
|
||||
l->inputs = l->w * l->h * l->c;
|
||||
|
||||
l->output = realloc(l->output, l->batch*l->outputs*sizeof(float));
|
||||
l->delta = realloc(l->delta, l->batch*l->outputs*sizeof(float));
|
||||
if(l->batch_normalize){
|
||||
l->x = realloc(l->x, l->batch*l->outputs*sizeof(float));
|
||||
l->x_norm = realloc(l->x_norm, l->batch*l->outputs*sizeof(float));
|
||||
}
|
||||
|
||||
#ifdef GPU
|
||||
cuda_free(l->delta_gpu);
|
||||
cuda_free(l->output_gpu);
|
||||
|
||||
l->delta_gpu = cuda_make_array(l->delta, l->batch*l->outputs);
|
||||
l->output_gpu = cuda_make_array(l->output, l->batch*l->outputs);
|
||||
|
||||
if(l->batch_normalize){
|
||||
cuda_free(l->x_gpu);
|
||||
cuda_free(l->x_norm_gpu);
|
||||
|
||||
l->x_gpu = cuda_make_array(l->output, l->batch*l->outputs);
|
||||
l->x_norm_gpu = cuda_make_array(l->output, l->batch*l->outputs);
|
||||
}
|
||||
#ifdef CUDNN
|
||||
cudnn_convolutional_setup(l);
|
||||
#endif
|
||||
#endif
|
||||
l->workspace_size = get_workspace_size(*l);
|
||||
}
|
||||
|
||||
void add_bias(float *output, float *biases, int batch, int n, int size)
|
||||
{
|
||||
int i,j,b;
|
||||
for(b = 0; b < batch; ++b){
|
||||
for(i = 0; i < n; ++i){
|
||||
for(j = 0; j < size; ++j){
|
||||
output[(b*n + i)*size + j] += biases[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void scale_bias(float *output, float *scales, int batch, int n, int size)
|
||||
{
|
||||
int i,j,b;
|
||||
for(b = 0; b < batch; ++b){
|
||||
for(i = 0; i < n; ++i){
|
||||
for(j = 0; j < size; ++j){
|
||||
output[(b*n + i)*size + j] *= scales[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void backward_bias(float *bias_updates, float *delta, int batch, int n, int size)
|
||||
{
|
||||
int i,b;
|
||||
for(b = 0; b < batch; ++b){
|
||||
for(i = 0; i < n; ++i){
|
||||
bias_updates[i] += sum_array(delta+size*(i+b*n), size);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void forward_convolutional_layer(convolutional_layer l, network net)
|
||||
{
|
||||
int i, j;
|
||||
|
||||
fill_cpu(l.outputs*l.batch, 0, l.output, 1);
|
||||
|
||||
if(l.xnor){
|
||||
binarize_weights(l.weights, l.n, l.c/l.groups*l.size*l.size, l.binary_weights);
|
||||
swap_binary(&l);
|
||||
binarize_cpu(net.input, l.c*l.h*l.w*l.batch, l.binary_input);
|
||||
net.input = l.binary_input;
|
||||
}
|
||||
|
||||
int m = l.n/l.groups;
|
||||
int k = l.size*l.size*l.c/l.groups;
|
||||
int n = l.out_w*l.out_h;
|
||||
for(i = 0; i < l.batch; ++i){
|
||||
for(j = 0; j < l.groups; ++j){
|
||||
float *a = l.weights + j*l.nweights/l.groups;
|
||||
float *b = net.workspace;
|
||||
float *c = l.output + (i*l.groups + j)*n*m;
|
||||
float *im = net.input + (i*l.groups + j)*l.c/l.groups*l.h*l.w;
|
||||
|
||||
if (l.size == 1) {
|
||||
b = im;
|
||||
} else {
|
||||
im2col_cpu(im, l.c/l.groups, l.h, l.w, l.size, l.stride, l.pad, b);
|
||||
}
|
||||
gemm(0,0,m,n,k,1,a,k,b,n,1,c,n);
|
||||
}
|
||||
}
|
||||
|
||||
if(l.batch_normalize){
|
||||
forward_batchnorm_layer(l, net);
|
||||
} else {
|
||||
add_bias(l.output, l.biases, l.batch, l.n, l.out_h*l.out_w);
|
||||
}
|
||||
|
||||
activate_array(l.output, l.outputs*l.batch, l.activation);
|
||||
if(l.binary || l.xnor) swap_binary(&l);
|
||||
}
|
||||
|
||||
void backward_convolutional_layer(convolutional_layer l, network net)
|
||||
{
|
||||
int i, j;
|
||||
int m = l.n/l.groups;
|
||||
int n = l.size*l.size*l.c/l.groups;
|
||||
int k = l.out_w*l.out_h;
|
||||
|
||||
gradient_array(l.output, l.outputs*l.batch, l.activation, l.delta);
|
||||
|
||||
if(l.batch_normalize){
|
||||
backward_batchnorm_layer(l, net);
|
||||
} else {
|
||||
backward_bias(l.bias_updates, l.delta, l.batch, l.n, k);
|
||||
}
|
||||
|
||||
for(i = 0; i < l.batch; ++i){
|
||||
for(j = 0; j < l.groups; ++j){
|
||||
float *a = l.delta + (i*l.groups + j)*m*k;
|
||||
float *b = net.workspace;
|
||||
float *c = l.weight_updates + j*l.nweights/l.groups;
|
||||
|
||||
float *im = net.input + (i*l.groups + j)*l.c/l.groups*l.h*l.w;
|
||||
float *imd = net.delta + (i*l.groups + j)*l.c/l.groups*l.h*l.w;
|
||||
|
||||
if(l.size == 1){
|
||||
b = im;
|
||||
} else {
|
||||
im2col_cpu(im, l.c/l.groups, l.h, l.w,
|
||||
l.size, l.stride, l.pad, b);
|
||||
}
|
||||
|
||||
gemm(0,1,m,n,k,1,a,k,b,k,1,c,n);
|
||||
|
||||
if (net.delta) {
|
||||
a = l.weights + j*l.nweights/l.groups;
|
||||
b = l.delta + (i*l.groups + j)*m*k;
|
||||
c = net.workspace;
|
||||
if (l.size == 1) {
|
||||
c = imd;
|
||||
}
|
||||
|
||||
gemm(1,0,n,k,m,1,a,n,b,k,0,c,k);
|
||||
|
||||
if (l.size != 1) {
|
||||
col2im_cpu(net.workspace, l.c/l.groups, l.h, l.w, l.size, l.stride, l.pad, imd);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void update_convolutional_layer(convolutional_layer l, update_args a)
|
||||
{
|
||||
float learning_rate = a.learning_rate*l.learning_rate_scale;
|
||||
float momentum = a.momentum;
|
||||
float decay = a.decay;
|
||||
int batch = a.batch;
|
||||
|
||||
axpy_cpu(l.n, learning_rate/batch, l.bias_updates, 1, l.biases, 1);
|
||||
scal_cpu(l.n, momentum, l.bias_updates, 1);
|
||||
|
||||
if(l.scales){
|
||||
axpy_cpu(l.n, learning_rate/batch, l.scale_updates, 1, l.scales, 1);
|
||||
scal_cpu(l.n, momentum, l.scale_updates, 1);
|
||||
}
|
||||
|
||||
axpy_cpu(l.nweights, -decay*batch, l.weights, 1, l.weight_updates, 1);
|
||||
axpy_cpu(l.nweights, learning_rate/batch, l.weight_updates, 1, l.weights, 1);
|
||||
scal_cpu(l.nweights, momentum, l.weight_updates, 1);
|
||||
}
|
||||
|
||||
|
||||
image get_convolutional_weight(convolutional_layer l, int i)
|
||||
{
|
||||
int h = l.size;
|
||||
int w = l.size;
|
||||
int c = l.c/l.groups;
|
||||
return float_to_image(w,h,c,l.weights+i*h*w*c);
|
||||
}
|
||||
|
||||
void rgbgr_weights(convolutional_layer l)
|
||||
{
|
||||
int i;
|
||||
for(i = 0; i < l.n; ++i){
|
||||
image im = get_convolutional_weight(l, i);
|
||||
if (im.c == 3) {
|
||||
rgbgr_image(im);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void rescale_weights(convolutional_layer l, float scale, float trans)
|
||||
{
|
||||
int i;
|
||||
for(i = 0; i < l.n; ++i){
|
||||
image im = get_convolutional_weight(l, i);
|
||||
if (im.c == 3) {
|
||||
scale_image(im, scale);
|
||||
float sum = sum_array(im.data, im.w*im.h*im.c);
|
||||
l.biases[i] += sum*trans;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
image *get_weights(convolutional_layer l)
|
||||
{
|
||||
image *weights = calloc(l.n, sizeof(image));
|
||||
int i;
|
||||
for(i = 0; i < l.n; ++i){
|
||||
weights[i] = copy_image(get_convolutional_weight(l, i));
|
||||
normalize_image(weights[i]);
|
||||
/*
|
||||
char buff[256];
|
||||
sprintf(buff, "filter%d", i);
|
||||
save_image(weights[i], buff);
|
||||
*/
|
||||
}
|
||||
//error("hey");
|
||||
return weights;
|
||||
}
|
||||
|
||||
image *visualize_convolutional_layer(convolutional_layer l, char *window, image *prev_weights)
|
||||
{
|
||||
image *single_weights = get_weights(l);
|
||||
show_images(single_weights, l.n, window);
|
||||
|
||||
image delta = get_convolutional_image(l);
|
||||
image dc = collapse_image_layers(delta, 1);
|
||||
char buff[256];
|
||||
sprintf(buff, "%s: Output", window);
|
||||
//show_image(dc, buff);
|
||||
//save_image(dc, buff);
|
||||
free_image(dc);
|
||||
return single_weights;
|
||||
}
|
||||
|
|
@ -0,0 +1,604 @@
|
|||
#include "convolutional_layer.h"
|
||||
#include "utils.h"
|
||||
#include "batchnorm_layer.h"
|
||||
#include "im2col.h"
|
||||
#include "col2im.h"
|
||||
#include "blas.h"
|
||||
#include "gemm.h"
|
||||
#include <stdio.h>
|
||||
#include <time.h>
|
||||
|
||||
#ifdef AI2
|
||||
#include "xnor_layer.h"
|
||||
#endif
|
||||
|
||||
void swap_binary(convolutional_layer *l) {
|
||||
float *swap = l->weights;
|
||||
l->weights = l->binary_weights;
|
||||
l->binary_weights = swap;
|
||||
|
||||
#ifdef GPU
|
||||
swap = l->weights_gpu;
|
||||
l->weights_gpu = l->binary_weights_gpu;
|
||||
l->binary_weights_gpu = swap;
|
||||
#endif
|
||||
}
|
||||
|
||||
void binarize_weights(float *weights, int n, int size, float *binary) {
|
||||
int i, f;
|
||||
for (f = 0; f < n; ++f) {
|
||||
float mean = 0;
|
||||
for (i = 0; i < size; ++i) {
|
||||
mean += fabs(weights[f * size + i]);
|
||||
}
|
||||
mean = mean / size;
|
||||
for (i = 0; i < size; ++i) {
|
||||
binary[f * size + i] = (weights[f * size + i] > 0) ? mean : -mean;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void binarize_cpu(float *input, int n, float *binary) {
|
||||
int i;
|
||||
for (i = 0; i < n; ++i) {
|
||||
binary[i] = (input[i] > 0) ? 1 : -1;
|
||||
}
|
||||
}
|
||||
|
||||
void binarize_input(float *input, int n, int size, float *binary) {
|
||||
int i, s;
|
||||
for (s = 0; s < size; ++s) {
|
||||
float mean = 0;
|
||||
for (i = 0; i < n; ++i) {
|
||||
mean += fabs(input[i * size + s]);
|
||||
}
|
||||
mean = mean / n;
|
||||
for (i = 0; i < n; ++i) {
|
||||
binary[i * size + s] = (input[i * size + s] > 0) ? mean : -mean;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int convolutional_out_height(convolutional_layer l) {
|
||||
return (l.h + 2 * l.pad - l.size) / l.stride + 1;
|
||||
}
|
||||
|
||||
int convolutional_out_width(convolutional_layer l) {
|
||||
return (l.w + 2 * l.pad - l.size) / l.stride + 1;
|
||||
}
|
||||
|
||||
image get_convolutional_image(convolutional_layer l) {
|
||||
return float_to_image(l.out_w, l.out_h, l.out_c, l.output);
|
||||
}
|
||||
|
||||
image get_convolutional_delta(convolutional_layer l) {
|
||||
return float_to_image(l.out_w, l.out_h, l.out_c, l.delta);
|
||||
}
|
||||
|
||||
static size_t get_workspace_size(layer l) {
|
||||
#ifdef CUDNN
|
||||
if(gpu_index >= 0){
|
||||
size_t most = 0;
|
||||
size_t s = 0;
|
||||
hipdnnGetConvolutionForwardWorkspaceSize(cudnn_handle(),
|
||||
l.srcTensorDesc,
|
||||
l.weightDesc,
|
||||
l.convDesc,
|
||||
l.dstTensorDesc,
|
||||
l.fw_algo,
|
||||
&s);
|
||||
if (s > most) most = s;
|
||||
hipdnnGetConvolutionBackwardFilterWorkspaceSize(cudnn_handle(),
|
||||
l.srcTensorDesc,
|
||||
l.ddstTensorDesc,
|
||||
l.convDesc,
|
||||
l.dweightDesc,
|
||||
l.bf_algo,
|
||||
&s);
|
||||
if (s > most) most = s;
|
||||
hipdnnGetConvolutionBackwardDataWorkspaceSize(cudnn_handle(),
|
||||
l.weightDesc,
|
||||
l.ddstTensorDesc,
|
||||
l.convDesc,
|
||||
l.dsrcTensorDesc,
|
||||
l.bd_algo,
|
||||
&s);
|
||||
if (s > most) most = s;
|
||||
return most;
|
||||
}
|
||||
#endif
|
||||
return (size_t) l.out_h * l.out_w * l.size * l.size * l.c / l.groups * sizeof(float);
|
||||
}
|
||||
|
||||
#ifdef GPU
|
||||
#ifdef CUDNN
|
||||
void cudnn_convolutional_setup(layer *l)
|
||||
{
|
||||
hipdnnSetTensor4dDescriptor(l->dsrcTensorDesc, HIPDNN_TENSOR_NCHW, HIPDNN_DATA_FLOAT, l->batch, l->c, l->h, l->w);
|
||||
hipdnnSetTensor4dDescriptor(l->ddstTensorDesc, HIPDNN_TENSOR_NCHW, HIPDNN_DATA_FLOAT, l->batch, l->out_c, l->out_h, l->out_w);
|
||||
|
||||
hipdnnSetTensor4dDescriptor(l->srcTensorDesc, HIPDNN_TENSOR_NCHW, HIPDNN_DATA_FLOAT, l->batch, l->c, l->h, l->w);
|
||||
hipdnnSetTensor4dDescriptor(l->dstTensorDesc, HIPDNN_TENSOR_NCHW, HIPDNN_DATA_FLOAT, l->batch, l->out_c, l->out_h, l->out_w);
|
||||
hipdnnSetTensor4dDescriptor(l->normTensorDesc, HIPDNN_TENSOR_NCHW, HIPDNN_DATA_FLOAT, 1, l->out_c, 1, 1);
|
||||
|
||||
hipdnnSetFilter4dDescriptor(l->dweightDesc, HIPDNN_DATA_FLOAT, HIPDNN_TENSOR_NCHW, l->n, l->c/l->groups, l->size, l->size);
|
||||
hipdnnSetFilter4dDescriptor(l->weightDesc, HIPDNN_DATA_FLOAT, HIPDNN_TENSOR_NCHW, l->n, l->c/l->groups, l->size, l->size);
|
||||
#if CUDNN_MAJOR >= 6
|
||||
hipdnnSetConvolution2dDescriptor(l->convDesc, l->pad, l->pad, l->stride, l->stride, 1, 1, HIPDNN_CROSS_CORRELATION, HIPDNN_DATA_FLOAT);
|
||||
#else
|
||||
hipdnnSetConvolution2dDescriptor(l->convDesc, l->pad, l->pad, l->stride, l->stride, 1, 1, HIPDNN_CROSS_CORRELATION);
|
||||
#endif
|
||||
|
||||
#if CUDNN_MAJOR >= 7
|
||||
hipdnnSetConvolutionGroupCount(l->convDesc, l->groups);
|
||||
#else
|
||||
if(l->groups > 1){
|
||||
error("CUDNN < 7 doesn't support groups, please upgrade!");
|
||||
}
|
||||
#endif
|
||||
|
||||
hipdnnGetConvolutionForwardAlgorithm(cudnn_handle(),
|
||||
l->srcTensorDesc,
|
||||
l->weightDesc,
|
||||
l->convDesc,
|
||||
l->dstTensorDesc,
|
||||
HIPDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
|
||||
2000000000,
|
||||
&l->fw_algo);
|
||||
hipdnnGetConvolutionBackwardDataAlgorithm(cudnn_handle(),
|
||||
l->weightDesc,
|
||||
l->ddstTensorDesc,
|
||||
l->convDesc,
|
||||
l->dsrcTensorDesc,
|
||||
HIPDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
|
||||
2000000000,
|
||||
&l->bd_algo);
|
||||
hipdnnGetConvolutionBackwardFilterAlgorithm(cudnn_handle(),
|
||||
l->srcTensorDesc,
|
||||
l->ddstTensorDesc,
|
||||
l->convDesc,
|
||||
l->dweightDesc,
|
||||
HIPDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
|
||||
2000000000,
|
||||
&l->bf_algo);
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
convolutional_layer
|
||||
make_convolutional_layer(int batch, int h, int w, int c, int n, int groups, int size, int stride, int padding,
|
||||
ACTIVATION activation, int batch_normalize, int binary, int xnor, int adam) {
|
||||
int i;
|
||||
convolutional_layer l = {(LAYER_TYPE) 0};
|
||||
l.type = CONVOLUTIONAL;
|
||||
|
||||
l.groups = groups;
|
||||
l.h = h;
|
||||
l.w = w;
|
||||
l.c = c;
|
||||
l.n = n;
|
||||
l.binary = binary;
|
||||
l.xnor = xnor;
|
||||
l.batch = batch;
|
||||
l.stride = stride;
|
||||
l.size = size;
|
||||
l.pad = padding;
|
||||
l.batch_normalize = batch_normalize;
|
||||
|
||||
l.weights = (float *) calloc(c / groups * n * size * size, sizeof(float));
|
||||
l.weight_updates = (float *) calloc(c / groups * n * size * size, sizeof(float));
|
||||
|
||||
l.biases = (float *) calloc(n, sizeof(float));
|
||||
l.bias_updates = (float *) calloc(n, sizeof(float));
|
||||
|
||||
l.nweights = c / groups * n * size * size;
|
||||
l.nbiases = n;
|
||||
|
||||
// float scale = 1./sqrt(size*size*c);
|
||||
float scale = sqrt(2. / (size * size * c / l.groups));
|
||||
//printf("convscale %f\n", scale);
|
||||
//scale = .02;
|
||||
//for(i = 0; i < c*n*size*size; ++i) l.weights[i] = scale*rand_uniform(-1, 1);
|
||||
for (i = 0; i < l.nweights; ++i) l.weights[i] = scale * rand_normal();
|
||||
int out_w = convolutional_out_width(l);
|
||||
int out_h = convolutional_out_height(l);
|
||||
l.out_h = out_h;
|
||||
l.out_w = out_w;
|
||||
l.out_c = n;
|
||||
l.outputs = l.out_h * l.out_w * l.out_c;
|
||||
l.inputs = l.w * l.h * l.c;
|
||||
|
||||
l.output = (float *) calloc(l.batch * l.outputs, sizeof(float));
|
||||
l.delta = (float *) calloc(l.batch * l.outputs, sizeof(float));
|
||||
|
||||
l.forward = forward_convolutional_layer;
|
||||
l.backward = backward_convolutional_layer;
|
||||
l.update = update_convolutional_layer;
|
||||
if (binary) {
|
||||
l.binary_weights = (float *) calloc(l.nweights, sizeof(float));
|
||||
l.cweights = (char *) calloc(l.nweights, sizeof(char));
|
||||
l.scales = (float *) calloc(n, sizeof(float));
|
||||
}
|
||||
if (xnor) {
|
||||
l.binary_weights = (float *) calloc(l.nweights, sizeof(float));
|
||||
l.binary_input = (float *) calloc(l.inputs * l.batch, sizeof(float));
|
||||
}
|
||||
|
||||
if (batch_normalize) {
|
||||
l.scales = (float *) calloc(n, sizeof(float));
|
||||
l.scale_updates = (float *) calloc(n, sizeof(float));
|
||||
for (i = 0; i < n; ++i) {
|
||||
l.scales[i] = 1;
|
||||
}
|
||||
|
||||
l.mean = (float *) calloc(n, sizeof(float));
|
||||
l.variance = (float *) calloc(n, sizeof(float));
|
||||
|
||||
l.mean_delta = (float *) calloc(n, sizeof(float));
|
||||
l.variance_delta = (float *) calloc(n, sizeof(float));
|
||||
|
||||
l.rolling_mean = (float *) calloc(n, sizeof(float));
|
||||
l.rolling_variance = (float *) calloc(n, sizeof(float));
|
||||
l.x = (float *) calloc(l.batch * l.outputs, sizeof(float));
|
||||
l.x_norm = (float *) calloc(l.batch * l.outputs, sizeof(float));
|
||||
}
|
||||
if (adam) {
|
||||
l.m = (float *) calloc(l.nweights, sizeof(float));
|
||||
l.v = (float *) calloc(l.nweights, sizeof(float));
|
||||
l.bias_m = (float *) calloc(n, sizeof(float));
|
||||
l.scale_m = (float *) calloc(n, sizeof(float));
|
||||
l.bias_v = (float *) calloc(n, sizeof(float));
|
||||
l.scale_v = (float *) calloc(n, sizeof(float));
|
||||
}
|
||||
|
||||
#ifdef GPU
|
||||
l.forward_gpu = forward_convolutional_layer_gpu;
|
||||
l.backward_gpu = backward_convolutional_layer_gpu;
|
||||
l.update_gpu = update_convolutional_layer_gpu;
|
||||
|
||||
if(gpu_index >= 0){
|
||||
if (adam) {
|
||||
l.m_gpu = cuda_make_array(l.m, l.nweights);
|
||||
l.v_gpu = cuda_make_array(l.v, l.nweights);
|
||||
l.bias_m_gpu = cuda_make_array(l.bias_m, n);
|
||||
l.bias_v_gpu = cuda_make_array(l.bias_v, n);
|
||||
l.scale_m_gpu = cuda_make_array(l.scale_m, n);
|
||||
l.scale_v_gpu = cuda_make_array(l.scale_v, n);
|
||||
}
|
||||
|
||||
l.weights_gpu = cuda_make_array(l.weights, l.nweights);
|
||||
l.weight_updates_gpu = cuda_make_array(l.weight_updates, l.nweights);
|
||||
|
||||
l.biases_gpu = cuda_make_array(l.biases, n);
|
||||
l.bias_updates_gpu = cuda_make_array(l.bias_updates, n);
|
||||
|
||||
l.delta_gpu = cuda_make_array(l.delta, l.batch*out_h*out_w*n);
|
||||
l.output_gpu = cuda_make_array(l.output, l.batch*out_h*out_w*n);
|
||||
|
||||
if(binary){
|
||||
l.binary_weights_gpu = cuda_make_array(l.weights, l.nweights);
|
||||
}
|
||||
if(xnor){
|
||||
l.binary_weights_gpu = cuda_make_array(l.weights, l.nweights);
|
||||
l.binary_input_gpu = cuda_make_array(0, l.inputs*l.batch);
|
||||
}
|
||||
|
||||
if(batch_normalize){
|
||||
l.mean_gpu = cuda_make_array(l.mean, n);
|
||||
l.variance_gpu = cuda_make_array(l.variance, n);
|
||||
|
||||
l.rolling_mean_gpu = cuda_make_array(l.mean, n);
|
||||
l.rolling_variance_gpu = cuda_make_array(l.variance, n);
|
||||
|
||||
l.mean_delta_gpu = cuda_make_array(l.mean, n);
|
||||
l.variance_delta_gpu = cuda_make_array(l.variance, n);
|
||||
|
||||
l.scales_gpu = cuda_make_array(l.scales, n);
|
||||
l.scale_updates_gpu = cuda_make_array(l.scale_updates, n);
|
||||
|
||||
l.x_gpu = cuda_make_array(l.output, l.batch*out_h*out_w*n);
|
||||
l.x_norm_gpu = cuda_make_array(l.output, l.batch*out_h*out_w*n);
|
||||
}
|
||||
#ifdef CUDNN
|
||||
hipdnnCreateTensorDescriptor(&l.normTensorDesc);
|
||||
hipdnnCreateTensorDescriptor(&l.srcTensorDesc);
|
||||
hipdnnCreateTensorDescriptor(&l.dstTensorDesc);
|
||||
hipdnnCreateFilterDescriptor(&l.weightDesc);
|
||||
hipdnnCreateTensorDescriptor(&l.dsrcTensorDesc);
|
||||
hipdnnCreateTensorDescriptor(&l.ddstTensorDesc);
|
||||
hipdnnCreateFilterDescriptor(&l.dweightDesc);
|
||||
hipdnnCreateConvolutionDescriptor(&l.convDesc);
|
||||
cudnn_convolutional_setup(&l);
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
l.workspace_size = get_workspace_size(l);
|
||||
l.activation = activation;
|
||||
|
||||
fprintf(stderr, "conv %5d %2d x%2d /%2d %4d x%4d x%4d -> %4d x%4d x%4d %5.3f BFLOPs\n", n, size, size, stride,
|
||||
w, h, c, l.out_w, l.out_h, l.out_c,
|
||||
(2.0 * l.n * l.size * l.size * l.c / l.groups * l.out_h * l.out_w) / 1000000000.);
|
||||
|
||||
return l;
|
||||
}
|
||||
|
||||
/*
 * Fold the batchnorm statistics into the raw weights and biases so the layer
 * computes the same function without normalization, then reset the batchnorm
 * parameters to identity (scale 1, mean 0, variance 1).
 */
void denormalize_convolutional_layer(convolutional_layer l) {
    int f, k;
    int filter_len = l.c / l.groups * l.size * l.size;
    for (f = 0; f < l.n; ++f) {
        /* .00001 guards against division by zero variance. */
        float scale = l.scales[f] / sqrt(l.rolling_variance[f] + .00001);
        for (k = 0; k < filter_len; ++k) {
            /* Keep the original left-to-right index arithmetic exactly. */
            l.weights[f * l.c / l.groups * l.size * l.size + k] *= scale;
        }
        l.biases[f] -= l.rolling_mean[f] * scale;
        l.scales[f] = 1;
        l.rolling_mean[f] = 0;
        l.rolling_variance[f] = 1;
    }
}
|
||||
|
||||
/*
|
||||
void test_convolutional_layer()
|
||||
{
|
||||
convolutional_layer l = make_convolutional_layer(1, 5, 5, 3, 2, 5, 2, 1, LEAKY, 1, 0, 0, 0);
|
||||
l.batch_normalize = 1;
|
||||
float data[] = {1,1,1,1,1,
|
||||
1,1,1,1,1,
|
||||
1,1,1,1,1,
|
||||
1,1,1,1,1,
|
||||
1,1,1,1,1,
|
||||
2,2,2,2,2,
|
||||
2,2,2,2,2,
|
||||
2,2,2,2,2,
|
||||
2,2,2,2,2,
|
||||
2,2,2,2,2,
|
||||
3,3,3,3,3,
|
||||
3,3,3,3,3,
|
||||
3,3,3,3,3,
|
||||
3,3,3,3,3,
|
||||
3,3,3,3,3};
|
||||
//net.input = data;
|
||||
//forward_convolutional_layer(l);
|
||||
}
|
||||
*/
|
||||
|
||||
/*
 * Resize the layer for a new input resolution (w, h), reallocating every
 * buffer whose size depends on the spatial dimensions. Weights, biases and
 * channel counts are untouched.
 */
void resize_convolutional_layer(convolutional_layer *l, int w, int h) {
    l->w = w;
    l->h = h;

    l->out_w = convolutional_out_width(*l);
    l->out_h = convolutional_out_height(*l);

    l->outputs = l->out_h * l->out_w * l->out_c;
    l->inputs = l->w * l->h * l->c;

    /* NOTE(review): darknet-style realloc — the old pointer is lost if
     * realloc fails; kept as-is to preserve behavior. */
    l->output = (float *) realloc(l->output, l->batch * l->outputs * sizeof(float));
    l->delta = (float *) realloc(l->delta, l->batch * l->outputs * sizeof(float));
    if (l->batch_normalize) {
        l->x = (float *) realloc(l->x, l->batch * l->outputs * sizeof(float));
        l->x_norm = (float *) realloc(l->x_norm, l->batch * l->outputs * sizeof(float));
    }

#ifdef GPU
    cuda_free(l->delta_gpu);
    cuda_free(l->output_gpu);

    l->delta_gpu = cuda_make_array(l->delta, l->batch * l->outputs);
    l->output_gpu = cuda_make_array(l->output, l->batch * l->outputs);

    if (l->batch_normalize) {
        cuda_free(l->x_gpu);
        cuda_free(l->x_norm_gpu);

        l->x_gpu = cuda_make_array(l->output, l->batch * l->outputs);
        l->x_norm_gpu = cuda_make_array(l->output, l->batch * l->outputs);
    }
#ifdef CUDNN
    cudnn_convolutional_setup(l);
#endif
#endif
    l->workspace_size = get_workspace_size(*l);
}
|
||||
|
||||
/*
 * Add biases[f] to every spatial element of feature map f, for each image in
 * the batch. `output` is laid out as [batch][n][size].
 */
void add_bias(float *output, float *biases, int batch, int n, int size) {
    int img, f, s;
    for (img = 0; img < batch; ++img) {
        for (f = 0; f < n; ++f) {
            float bias = biases[f];
            float *dst = output + (img * n + f) * size;
            for (s = 0; s < size; ++s) {
                dst[s] += bias;
            }
        }
    }
}
|
||||
|
||||
/*
 * Multiply every spatial element of feature map f by scales[f], for each
 * image in the batch. `output` is laid out as [batch][n][size].
 */
void scale_bias(float *output, float *scales, int batch, int n, int size) {
    int img, f, s;
    for (img = 0; img < batch; ++img) {
        for (f = 0; f < n; ++f) {
            float sc = scales[f];
            float *dst = output + (img * n + f) * size;
            for (s = 0; s < size; ++s) {
                dst[s] *= sc;
            }
        }
    }
}
|
||||
|
||||
/*
 * Accumulate the bias gradients: bias_updates[f] += sum of the delta values
 * of feature map f, summed over the whole batch. `delta` is laid out as
 * [batch][n][size].
 */
void backward_bias(float *bias_updates, float *delta, int batch, int n, int size) {
    int img, f;
    for (img = 0; img < batch; ++img) {
        for (f = 0; f < n; ++f) {
            bias_updates[f] += sum_array(delta + size * (f + img * n), size);
        }
    }
}
|
||||
|
||||
/*
 * Forward pass on CPU: clears the output, optionally binarizes weights and
 * input (xnor mode), then for each image and group performs an im2col + GEMM
 * convolution, applies bias or batchnorm, and finally the activation.
 */
void forward_convolutional_layer(convolutional_layer l, network net) {
    fill_cpu(l.outputs * l.batch, 0, l.output, 1);

    if (l.xnor) {
        /* Swap in binarized weights/input for the duration of the pass. */
        binarize_weights(l.weights, l.n, l.c / l.groups * l.size * l.size, l.binary_weights);
        swap_binary(&l);
        binarize_cpu(net.input, l.c * l.h * l.w * l.batch, l.binary_input);
        net.input = l.binary_input;
    }

    int m = l.n / l.groups;                    /* filters per group          */
    int k = l.size * l.size * l.c / l.groups;  /* weights per filter         */
    int n = l.out_w * l.out_h;                 /* spatial output size        */
    int img, g;
    for (img = 0; img < l.batch; ++img) {
        for (g = 0; g < l.groups; ++g) {
            float *group_weights = l.weights + g * l.nweights / l.groups;
            float *colbuf = net.workspace;
            float *group_out = l.output + (img * l.groups + g) * n * m;
            float *im = net.input + (img * l.groups + g) * l.c / l.groups * l.h * l.w;

            if (l.size == 1) {
                /* 1x1 convolution: the image is already in column form. */
                colbuf = im;
            } else {
                im2col_cpu(im, l.c / l.groups, l.h, l.w, l.size, l.stride, l.pad, colbuf);
            }
            gemm(0, 0, m, n, k, 1, group_weights, k, colbuf, n, 1, group_out, n);
        }
    }

    if (l.batch_normalize) {
        forward_batchnorm_layer(l, net);
    } else {
        add_bias(l.output, l.biases, l.batch, l.n, l.out_h * l.out_w);
    }

    activate_array(l.output, l.outputs * l.batch, l.activation);
    if (l.binary || l.xnor) swap_binary(&l);
}
|
||||
|
||||
/*
 * Backward pass on CPU: propagates the gradient through the activation,
 * accumulates bias (or batchnorm) gradients and weight gradients, and — when
 * net.delta is non-NULL — computes the gradient w.r.t. the layer input.
 *
 * Fix: the input-delta pointer `imd` was previously computed from net.delta
 * unconditionally; when net.delta is NULL (first layer of a network) that is
 * pointer arithmetic on a null pointer, which is undefined behavior in C.
 * It is now formed only inside the `if (net.delta)` branch where it is used.
 */
void backward_convolutional_layer(convolutional_layer l, network net) {
    int i, j;
    int m = l.n / l.groups;                    /* filters per group     */
    int n = l.size * l.size * l.c / l.groups;  /* weights per filter    */
    int k = l.out_w * l.out_h;                 /* spatial output size   */

    gradient_array(l.output, l.outputs * l.batch, l.activation, l.delta);

    if (l.batch_normalize) {
        backward_batchnorm_layer(l, net);
    } else {
        backward_bias(l.bias_updates, l.delta, l.batch, l.n, k);
    }

    for (i = 0; i < l.batch; ++i) {
        for (j = 0; j < l.groups; ++j) {
            float *a = l.delta + (i * l.groups + j) * m * k;
            float *b = net.workspace;
            float *c = l.weight_updates + j * l.nweights / l.groups;

            float *im = net.input + (i * l.groups + j) * l.c / l.groups * l.h * l.w;

            if (l.size == 1) {
                b = im;
            } else {
                im2col_cpu(im, l.c / l.groups, l.h, l.w,
                           l.size, l.stride, l.pad, b);
            }

            /* dW += dOut * col(input)^T */
            gemm(0, 1, m, n, k, 1, a, k, b, k, 1, c, n);

            if (net.delta) {
                float *imd = net.delta + (i * l.groups + j) * l.c / l.groups * l.h * l.w;
                a = l.weights + j * l.nweights / l.groups;
                b = l.delta + (i * l.groups + j) * m * k;
                c = net.workspace;
                if (l.size == 1) {
                    c = imd;
                }

                /* dIn(col) = W^T * dOut */
                gemm(1, 0, n, k, m, 1, a, n, b, k, 0, c, k);

                if (l.size != 1) {
                    col2im_cpu(net.workspace, l.c / l.groups, l.h, l.w, l.size, l.stride, l.pad, imd);
                }
            }
        }
    }
}
|
||||
|
||||
/*
 * SGD-with-momentum parameter update: applies accumulated bias, scale and
 * weight gradients (averaged over the batch), with L2 weight decay on the
 * weights only, then decays the accumulators by the momentum factor.
 */
void update_convolutional_layer(convolutional_layer l, update_args a) {
    float lr = a.learning_rate * l.learning_rate_scale;
    int batch = a.batch;

    /* Biases. */
    axpy_cpu(l.n, lr / batch, l.bias_updates, 1, l.biases, 1);
    scal_cpu(l.n, a.momentum, l.bias_updates, 1);

    /* Batchnorm scales, if present. */
    if (l.scales) {
        axpy_cpu(l.n, lr / batch, l.scale_updates, 1, l.scales, 1);
        scal_cpu(l.n, a.momentum, l.scale_updates, 1);
    }

    /* Weights: decay term folded into the gradient, then the step. */
    axpy_cpu(l.nweights, -a.decay * batch, l.weights, 1, l.weight_updates, 1);
    axpy_cpu(l.nweights, lr / batch, l.weight_updates, 1, l.weights, 1);
    scal_cpu(l.nweights, a.momentum, l.weight_updates, 1);
}
|
||||
|
||||
|
||||
/*
 * View filter i as an image of shape (size, size, c/groups).
 * No copy is made: the image aliases the layer's weight buffer.
 */
image get_convolutional_weight(convolutional_layer l, int i) {
    int channels = l.c / l.groups;
    float *data = l.weights + i * l.size * l.size * channels;
    return float_to_image(l.size, l.size, channels, data);
}
|
||||
|
||||
/*
 * Swap the R and B channels of every 3-channel filter in place
 * (RGB <-> BGR). Filters with any other channel count are left alone.
 */
void rgbgr_weights(convolutional_layer l) {
    int f;
    for (f = 0; f < l.n; ++f) {
        image filter = get_convolutional_weight(l, f);
        if (filter.c == 3) {
            rgbgr_image(filter);
        }
    }
}
|
||||
|
||||
/*
 * Scale every 3-channel filter by `scale` in place, and shift the matching
 * bias by (sum of the scaled filter) * trans. Filters with any other channel
 * count are left alone.
 */
void rescale_weights(convolutional_layer l, float scale, float trans) {
    int f;
    for (f = 0; f < l.n; ++f) {
        image filter = get_convolutional_weight(l, f);
        if (filter.c != 3) continue;
        scale_image(filter, scale);
        float sum = sum_array(filter.data, filter.w * filter.h * filter.c);
        l.biases[f] += sum * trans;
    }
}
|
||||
|
||||
/*
 * Return an array of l.n images, each a normalized *copy* of one filter.
 * Ownership transfers to the caller: free each image and the array itself.
 */
image *get_weights(convolutional_layer l) {
    image *weights = (image *) calloc(l.n, sizeof(image));
    int f;
    for (f = 0; f < l.n; ++f) {
        weights[f] = copy_image(get_convolutional_weight(l, f));
        normalize_image(weights[f]);
    }
    return weights;
}
|
||||
|
||||
/*
 * Display the layer's filters in `window` and return them as an image array
 * (caller owns — see get_weights). The collapsed output image is built and
 * freed but no longer shown/saved; prev_weights is currently unused.
 */
image *visualize_convolutional_layer(convolutional_layer l, char *window, image *prev_weights) {
    image *filters = get_weights(l);
    show_images(filters, l.n, window);

    image output = get_convolutional_image(l);
    image collapsed = collapse_image_layers(output, 1);
    char title[256];
    sprintf(title, "%s: Output", window);
    free_image(collapsed);
    return filters;
}
|
||||
|
|
@ -1,7 +1,6 @@
|
|||
#ifndef CONVOLUTIONAL_LAYER_H
|
||||
#define CONVOLUTIONAL_LAYER_H
|
||||
|
||||
#include "cuda.h"
|
||||
#include "image.h"
|
||||
#include "activations.h"
|
||||
#include "layer.h"
|
||||
|
@ -10,6 +9,9 @@
|
|||
typedef layer convolutional_layer;
|
||||
|
||||
#ifdef GPU
|
||||
#include "cuda.h"
|
||||
#include "hip/hip_runtime.h"
|
||||
|
||||
void forward_convolutional_layer_gpu(convolutional_layer layer, network net);
|
||||
void backward_convolutional_layer_gpu(convolutional_layer layer, network net);
|
||||
void update_convolutional_layer_gpu(convolutional_layer layer, update_args a);
|
||||
|
@ -25,25 +27,38 @@ void cudnn_convolutional_setup(layer *l);
|
|||
#endif
|
||||
#endif
|
||||
|
||||
convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int n, int groups, int size, int stride, int padding, ACTIVATION activation, int batch_normalize, int binary, int xnor, int adam);
|
||||
convolutional_layer
|
||||
make_convolutional_layer(int batch, int h, int w, int c, int n, int groups, int size, int stride, int padding,
|
||||
ACTIVATION activation, int batch_normalize, int binary, int xnor, int adam);
|
||||
|
||||
void resize_convolutional_layer(convolutional_layer *layer, int w, int h);
|
||||
|
||||
void forward_convolutional_layer(const convolutional_layer layer, network net);
|
||||
|
||||
void update_convolutional_layer(convolutional_layer layer, update_args a);
|
||||
|
||||
image *visualize_convolutional_layer(convolutional_layer layer, char *window, image *prev_weights);
|
||||
|
||||
void binarize_weights(float *weights, int n, int size, float *binary);
|
||||
|
||||
void swap_binary(convolutional_layer *l);
|
||||
|
||||
void binarize_weights2(float *weights, int n, int size, char *binary, float *scales);
|
||||
|
||||
void backward_convolutional_layer(convolutional_layer layer, network net);
|
||||
|
||||
void add_bias(float *output, float *biases, int batch, int n, int size);
|
||||
|
||||
void backward_bias(float *bias_updates, float *delta, int batch, int n, int size);
|
||||
|
||||
image get_convolutional_image(convolutional_layer layer);
|
||||
|
||||
image get_convolutional_delta(convolutional_layer layer);
|
||||
|
||||
image get_convolutional_weight(convolutional_layer layer, int i);
|
||||
|
||||
int convolutional_out_height(convolutional_layer layer);
|
||||
|
||||
int convolutional_out_width(convolutional_layer layer);
|
||||
|
||||
#endif
|
||||
|
|
|
@ -1,27 +1,28 @@
|
|||
#include "cost_layer.h"
|
||||
#include "utils.h"
|
||||
#include "cuda.h"
|
||||
#include "blas.h"
|
||||
#include <math.h>
|
||||
#include <string.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
|
||||
COST_TYPE get_cost_type(char *s)
|
||||
{
|
||||
if (strcmp(s, "seg")==0) return SEG;
|
||||
if (strcmp(s, "sse")==0) return SSE;
|
||||
if (strcmp(s, "masked")==0) return MASKED;
|
||||
if (strcmp(s, "smooth")==0) return SMOOTH;
|
||||
if (strcmp(s, "L1")==0) return L1;
|
||||
if (strcmp(s, "wgan")==0) return WGAN;
|
||||
#ifdef GPU
|
||||
#include "hip/hip_runtime.h"
|
||||
#endif
|
||||
|
||||
COST_TYPE get_cost_type(char *s) {
|
||||
if (strcmp(s, "seg") == 0) return SEG;
|
||||
if (strcmp(s, "sse") == 0) return SSE;
|
||||
if (strcmp(s, "masked") == 0) return MASKED;
|
||||
if (strcmp(s, "smooth") == 0) return SMOOTH;
|
||||
if (strcmp(s, "L1") == 0) return L1;
|
||||
if (strcmp(s, "wgan") == 0) return WGAN;
|
||||
fprintf(stderr, "Couldn't find cost type %s, going with SSE\n", s);
|
||||
return SSE;
|
||||
}
|
||||
|
||||
char *get_cost_string(COST_TYPE a)
|
||||
{
|
||||
switch(a){
|
||||
char *get_cost_string(COST_TYPE a) {
|
||||
switch (a) {
|
||||
case SEG:
|
||||
return "seg";
|
||||
case SSE:
|
||||
|
@ -38,10 +39,9 @@ char *get_cost_string(COST_TYPE a)
|
|||
return "sse";
|
||||
}
|
||||
|
||||
cost_layer make_cost_layer(int batch, int inputs, COST_TYPE cost_type, float scale)
|
||||
{
|
||||
fprintf(stderr, "cost %4d\n", inputs);
|
||||
cost_layer l = {0};
|
||||
cost_layer make_cost_layer(int batch, int inputs, COST_TYPE cost_type, float scale) {
|
||||
fprintf(stderr, "cost %4d\n", inputs);
|
||||
cost_layer l = {(LAYER_TYPE)0};
|
||||
l.type = COST;
|
||||
|
||||
l.scale = scale;
|
||||
|
@ -49,28 +49,27 @@ cost_layer make_cost_layer(int batch, int inputs, COST_TYPE cost_type, float sca
|
|||
l.inputs = inputs;
|
||||
l.outputs = inputs;
|
||||
l.cost_type = cost_type;
|
||||
l.delta = calloc(inputs*batch, sizeof(float));
|
||||
l.output = calloc(inputs*batch, sizeof(float));
|
||||
l.cost = calloc(1, sizeof(float));
|
||||
l.delta = (float *) calloc(inputs * batch, sizeof(float));
|
||||
l.output = (float *) calloc(inputs * batch, sizeof(float));
|
||||
l.cost = (float *) calloc(1, sizeof(float));
|
||||
|
||||
l.forward = forward_cost_layer;
|
||||
l.backward = backward_cost_layer;
|
||||
#ifdef GPU
|
||||
#ifdef GPU
|
||||
l.forward_gpu = forward_cost_layer_gpu;
|
||||
l.backward_gpu = backward_cost_layer_gpu;
|
||||
|
||||
l.delta_gpu = cuda_make_array(l.output, inputs*batch);
|
||||
l.output_gpu = cuda_make_array(l.delta, inputs*batch);
|
||||
#endif
|
||||
#endif
|
||||
return l;
|
||||
}
|
||||
|
||||
void resize_cost_layer(cost_layer *l, int inputs)
|
||||
{
|
||||
void resize_cost_layer(cost_layer *l, int inputs) {
|
||||
l->inputs = inputs;
|
||||
l->outputs = inputs;
|
||||
l->delta = realloc(l->delta, inputs*l->batch*sizeof(float));
|
||||
l->output = realloc(l->output, inputs*l->batch*sizeof(float));
|
||||
l->delta = (float *) realloc(l->delta, inputs * l->batch * sizeof(float));
|
||||
l->output = (float *) realloc(l->output, inputs * l->batch * sizeof(float));
|
||||
#ifdef GPU
|
||||
cuda_free(l->delta_gpu);
|
||||
cuda_free(l->output_gpu);
|
||||
|
@ -79,28 +78,26 @@ void resize_cost_layer(cost_layer *l, int inputs)
|
|||
#endif
|
||||
}
|
||||
|
||||
void forward_cost_layer(cost_layer l, network net)
|
||||
{
|
||||
void forward_cost_layer(cost_layer l, network net) {
|
||||
if (!net.truth) return;
|
||||
if(l.cost_type == MASKED){
|
||||
if (l.cost_type == MASKED) {
|
||||
int i;
|
||||
for(i = 0; i < l.batch*l.inputs; ++i){
|
||||
if(net.truth[i] == SECRET_NUM) net.input[i] = SECRET_NUM;
|
||||
for (i = 0; i < l.batch * l.inputs; ++i) {
|
||||
if (net.truth[i] == SECRET_NUM) net.input[i] = SECRET_NUM;
|
||||
}
|
||||
}
|
||||
if(l.cost_type == SMOOTH){
|
||||
smooth_l1_cpu(l.batch*l.inputs, net.input, net.truth, l.delta, l.output);
|
||||
}else if(l.cost_type == L1){
|
||||
l1_cpu(l.batch*l.inputs, net.input, net.truth, l.delta, l.output);
|
||||
if (l.cost_type == SMOOTH) {
|
||||
smooth_l1_cpu(l.batch * l.inputs, net.input, net.truth, l.delta, l.output);
|
||||
} else if (l.cost_type == L1) {
|
||||
l1_cpu(l.batch * l.inputs, net.input, net.truth, l.delta, l.output);
|
||||
} else {
|
||||
l2_cpu(l.batch*l.inputs, net.input, net.truth, l.delta, l.output);
|
||||
l2_cpu(l.batch * l.inputs, net.input, net.truth, l.delta, l.output);
|
||||
}
|
||||
l.cost[0] = sum_array(l.output, l.batch*l.inputs);
|
||||
l.cost[0] = sum_array(l.output, l.batch * l.inputs);
|
||||
}
|
||||
|
||||
void backward_cost_layer(const cost_layer l, network net)
|
||||
{
|
||||
axpy_cpu(l.batch*l.inputs, l.scale, l.delta, 1, net.delta, 1);
|
||||
void backward_cost_layer(const cost_layer l, network net) {
|
||||
axpy_cpu(l.batch * l.inputs, l.scale, l.delta, 1, net.delta, 1);
|
||||
}
|
||||
|
||||
#ifdef GPU
|
|
@ -1,15 +1,21 @@
|
|||
#ifndef COST_LAYER_H
|
||||
#define COST_LAYER_H
|
||||
|
||||
#include "layer.h"
|
||||
#include "network.h"
|
||||
|
||||
typedef layer cost_layer;
|
||||
|
||||
COST_TYPE get_cost_type(char *s);
|
||||
|
||||
char *get_cost_string(COST_TYPE a);
|
||||
|
||||
cost_layer make_cost_layer(int batch, int inputs, COST_TYPE type, float scale);
|
||||
|
||||
void forward_cost_layer(const cost_layer l, network net);
|
||||
|
||||
void backward_cost_layer(const cost_layer l, network net);
|
||||
|
||||
void resize_cost_layer(cost_layer *l, int inputs);
|
||||
|
||||
#ifdef GPU
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
#include "crnn_layer.h"
|
||||
#include "convolutional_layer.h"
|
||||
#include "utils.h"
|
||||
#include "cuda.h"
|
||||
#include "blas.h"
|
||||
#include "gemm.h"
|
||||
|
||||
|
@ -10,9 +9,12 @@
|
|||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
static void increment_layer(layer *l, int steps)
|
||||
{
|
||||
int num = l->outputs*l->batch*steps;
|
||||
#ifdef GPU
|
||||
#include "hip/hip_runtime.h"
|
||||
#endif
|
||||
|
||||
static void increment_layer(layer *l, int steps) {
|
||||
int num = l->outputs * l->batch * steps;
|
||||
l->output += num;
|
||||
l->delta += num;
|
||||
l->x += num;
|
||||
|
@ -26,11 +28,11 @@ static void increment_layer(layer *l, int steps)
|
|||
#endif
|
||||
}
|
||||
|
||||
layer make_crnn_layer(int batch, int h, int w, int c, int hidden_filters, int output_filters, int steps, ACTIVATION activation, int batch_normalize)
|
||||
{
|
||||
fprintf(stderr, "CRNN Layer: %d x %d x %d image, %d filters\n", h,w,c,output_filters);
|
||||
layer make_crnn_layer(int batch, int h, int w, int c, int hidden_filters, int output_filters, int steps,
|
||||
ACTIVATION activation, int batch_normalize) {
|
||||
fprintf(stderr, "CRNN Layer: %d x %d x %d image, %d filters\n", h, w, c, output_filters);
|
||||
batch = batch / steps;
|
||||
layer l = {0};
|
||||
layer l = {(LAYER_TYPE) 0};
|
||||
l.batch = batch;
|
||||
l.type = CRNN;
|
||||
l.steps = steps;
|
||||
|
@ -40,25 +42,28 @@ layer make_crnn_layer(int batch, int h, int w, int c, int hidden_filters, int ou
|
|||
l.out_h = h;
|
||||
l.out_w = w;
|
||||
l.out_c = output_filters;
|
||||
l.inputs = h*w*c;
|
||||
l.inputs = h * w * c;
|
||||
l.hidden = h * w * hidden_filters;
|
||||
l.outputs = l.out_h * l.out_w * l.out_c;
|
||||
|
||||
l.state = calloc(l.hidden*batch*(steps+1), sizeof(float));
|
||||
l.state = (float *) calloc(l.hidden * batch * (steps + 1), sizeof(float));
|
||||
|
||||
l.input_layer = malloc(sizeof(layer));
|
||||
l.input_layer = (layer *) malloc(sizeof(layer));
|
||||
fprintf(stderr, "\t\t");
|
||||
*(l.input_layer) = make_convolutional_layer(batch*steps, h, w, c, hidden_filters, 1, 3, 1, 1, activation, batch_normalize, 0, 0, 0);
|
||||
*(l.input_layer) = make_convolutional_layer(batch * steps, h, w, c, hidden_filters, 1, 3, 1, 1, activation,
|
||||
batch_normalize, 0, 0, 0);
|
||||
l.input_layer->batch = batch;
|
||||
|
||||
l.self_layer = malloc(sizeof(layer));
|
||||
l.self_layer = (layer *) malloc(sizeof(layer));
|
||||
fprintf(stderr, "\t\t");
|
||||
*(l.self_layer) = make_convolutional_layer(batch*steps, h, w, hidden_filters, hidden_filters, 1, 3, 1, 1, activation, batch_normalize, 0, 0, 0);
|
||||
*(l.self_layer) = make_convolutional_layer(batch * steps, h, w, hidden_filters, hidden_filters, 1, 3, 1, 1,
|
||||
activation, batch_normalize, 0, 0, 0);
|
||||
l.self_layer->batch = batch;
|
||||
|
||||
l.output_layer = malloc(sizeof(layer));
|
||||
l.output_layer = (layer *) malloc(sizeof(layer));
|
||||
fprintf(stderr, "\t\t");
|
||||
*(l.output_layer) = make_convolutional_layer(batch*steps, h, w, hidden_filters, output_filters, 1, 3, 1, 1, activation, batch_normalize, 0, 0, 0);
|
||||
*(l.output_layer) = make_convolutional_layer(batch * steps, h, w, hidden_filters, output_filters, 1, 3, 1, 1,
|
||||
activation, batch_normalize, 0, 0, 0);
|
||||
l.output_layer->batch = batch;
|
||||
|
||||
l.output = l.output_layer->output;
|
||||
|
@ -81,15 +86,13 @@ layer make_crnn_layer(int batch, int h, int w, int c, int hidden_filters, int ou
|
|||
return l;
|
||||
}
|
||||
|
||||
void update_crnn_layer(layer l, update_args a)
|
||||
{
|
||||
update_convolutional_layer(*(l.input_layer), a);
|
||||
update_convolutional_layer(*(l.self_layer), a);
|
||||
void update_crnn_layer(layer l, update_args a) {
|
||||
update_convolutional_layer(*(l.input_layer), a);
|
||||
update_convolutional_layer(*(l.self_layer), a);
|
||||
update_convolutional_layer(*(l.output_layer), a);
|
||||
}
|
||||
|
||||
void forward_crnn_layer(layer l, network net)
|
||||
{
|
||||
void forward_crnn_layer(layer l, network net) {
|
||||
network s = net;
|
||||
s.train = net.train;
|
||||
int i;
|
||||
|
@ -100,7 +103,7 @@ void forward_crnn_layer(layer l, network net)
|
|||
fill_cpu(l.outputs * l.batch * l.steps, 0, output_layer.delta, 1);
|
||||
fill_cpu(l.hidden * l.batch * l.steps, 0, self_layer.delta, 1);
|
||||
fill_cpu(l.hidden * l.batch * l.steps, 0, input_layer.delta, 1);
|
||||
if(net.train) fill_cpu(l.hidden * l.batch, 0, l.state, 1);
|
||||
if (net.train) fill_cpu(l.hidden * l.batch, 0, l.state, 1);
|
||||
|
||||
for (i = 0; i < l.steps; ++i) {
|
||||
s.input = net.input;
|
||||
|
@ -110,10 +113,10 @@ void forward_crnn_layer(layer l, network net)
|
|||
forward_convolutional_layer(self_layer, s);
|
||||
|
||||
float *old_state = l.state;
|
||||
if(net.train) l.state += l.hidden*l.batch;
|
||||
if(l.shortcut){
|
||||
if (net.train) l.state += l.hidden * l.batch;
|
||||
if (l.shortcut) {
|
||||
copy_cpu(l.hidden * l.batch, old_state, 1, l.state, 1);
|
||||
}else{
|
||||
} else {
|
||||
fill_cpu(l.hidden * l.batch, 0, l.state, 1);
|
||||
}
|
||||
axpy_cpu(l.hidden * l.batch, 1, input_layer.output, 1, l.state, 1);
|
||||
|
@ -122,27 +125,26 @@ void forward_crnn_layer(layer l, network net)
|
|||
s.input = l.state;
|
||||
forward_convolutional_layer(output_layer, s);
|
||||
|
||||
net.input += l.inputs*l.batch;
|
||||
net.input += l.inputs * l.batch;
|
||||
increment_layer(&input_layer, 1);
|
||||
increment_layer(&self_layer, 1);
|
||||
increment_layer(&output_layer, 1);
|
||||
}
|
||||
}
|
||||
|
||||
void backward_crnn_layer(layer l, network net)
|
||||
{
|
||||
void backward_crnn_layer(layer l, network net) {
|
||||
network s = net;
|
||||
int i;
|
||||
layer input_layer = *(l.input_layer);
|
||||
layer self_layer = *(l.self_layer);
|
||||
layer output_layer = *(l.output_layer);
|
||||
|
||||
increment_layer(&input_layer, l.steps-1);
|
||||
increment_layer(&self_layer, l.steps-1);
|
||||
increment_layer(&output_layer, l.steps-1);
|
||||
increment_layer(&input_layer, l.steps - 1);
|
||||
increment_layer(&self_layer, l.steps - 1);
|
||||
increment_layer(&output_layer, l.steps - 1);
|
||||
|
||||
l.state += l.hidden*l.batch*l.steps;
|
||||
for (i = l.steps-1; i >= 0; --i) {
|
||||
l.state += l.hidden * l.batch * l.steps;
|
||||
for (i = l.steps - 1; i >= 0; --i) {
|
||||
copy_cpu(l.hidden * l.batch, input_layer.output, 1, l.state, 1);
|
||||
axpy_cpu(l.hidden * l.batch, 1, self_layer.output, 1, l.state, 1);
|
||||
|
||||
|
@ -150,7 +152,7 @@ void backward_crnn_layer(layer l, network net)
|
|||
s.delta = self_layer.delta;
|
||||
backward_convolutional_layer(output_layer, s);
|
||||
|
||||
l.state -= l.hidden*l.batch;
|
||||
l.state -= l.hidden * l.batch;
|
||||
/*
|
||||
if(i > 0){
|
||||
copy_cpu(l.hidden * l.batch, input_layer.output - l.hidden*l.batch, 1, l.state, 1);
|
||||
|
@ -161,14 +163,15 @@ void backward_crnn_layer(layer l, network net)
|
|||
*/
|
||||
|
||||
s.input = l.state;
|
||||
s.delta = self_layer.delta - l.hidden*l.batch;
|
||||
s.delta = self_layer.delta - l.hidden * l.batch;
|
||||
if (i == 0) s.delta = 0;
|
||||
backward_convolutional_layer(self_layer, s);
|
||||
|
||||
copy_cpu(l.hidden*l.batch, self_layer.delta, 1, input_layer.delta, 1);
|
||||
if (i > 0 && l.shortcut) axpy_cpu(l.hidden*l.batch, 1, self_layer.delta, 1, self_layer.delta - l.hidden*l.batch, 1);
|
||||
s.input = net.input + i*l.inputs*l.batch;
|
||||
if(net.delta) s.delta = net.delta + i*l.inputs*l.batch;
|
||||
copy_cpu(l.hidden * l.batch, self_layer.delta, 1, input_layer.delta, 1);
|
||||
if (i > 0 && l.shortcut)
|
||||
axpy_cpu(l.hidden * l.batch, 1, self_layer.delta, 1, self_layer.delta - l.hidden * l.batch, 1);
|
||||
s.input = net.input + i * l.inputs * l.batch;
|
||||
if (net.delta) s.delta = net.delta + i * l.inputs * l.batch;
|
||||
else s.delta = 0;
|
||||
backward_convolutional_layer(input_layer, s);
|
||||
|
|
@ -1,4 +1,3 @@
|
|||
|
||||
#ifndef CRNN_LAYER_H
|
||||
#define CRNN_LAYER_H
|
||||
|
||||
|
@ -6,10 +5,13 @@
|
|||
#include "layer.h"
|
||||
#include "network.h"
|
||||
|
||||
layer make_crnn_layer(int batch, int h, int w, int c, int hidden_filters, int output_filters, int steps, ACTIVATION activation, int batch_normalize);
|
||||
layer make_crnn_layer(int batch, int h, int w, int c, int hidden_filters, int output_filters, int steps,
|
||||
ACTIVATION activation, int batch_normalize);
|
||||
|
||||
void forward_crnn_layer(layer l, network net);
|
||||
|
||||
void backward_crnn_layer(layer l, network net);
|
||||
|
||||
void update_crnn_layer(layer l, update_args a);
|
||||
|
||||
#ifdef GPU
|
||||
|
|
103
src/crop_layer.c
103
src/crop_layer.c
|
@ -1,103 +0,0 @@
|
|||
#include "crop_layer.h"
|
||||
#include "cuda.h"
|
||||
#include <stdio.h>
|
||||
|
||||
/*
 * View the crop layer's output buffer as an image of shape
 * (out_w, out_h, out_c). No copy is made.
 */
image get_crop_image(crop_layer l) {
    return float_to_image(l.out_w, l.out_h, l.out_c, l.output);
}
|
||||
|
||||
void backward_crop_layer(const crop_layer l, network net){}
|
||||
void backward_crop_layer_gpu(const crop_layer l, network net){}
|
||||
|
||||
crop_layer make_crop_layer(int batch, int h, int w, int c, int crop_height, int crop_width, int flip, float angle, float saturation, float exposure)
|
||||
{
|
||||
fprintf(stderr, "Crop Layer: %d x %d -> %d x %d x %d image\n", h,w,crop_height,crop_width,c);
|
||||
crop_layer l = {0};
|
||||
l.type = CROP;
|
||||
l.batch = batch;
|
||||
l.h = h;
|
||||
l.w = w;
|
||||
l.c = c;
|
||||
l.scale = (float)crop_height / h;
|
||||
l.flip = flip;
|
||||
l.angle = angle;
|
||||
l.saturation = saturation;
|
||||
l.exposure = exposure;
|
||||
l.out_w = crop_width;
|
||||
l.out_h = crop_height;
|
||||
l.out_c = c;
|
||||
l.inputs = l.w * l.h * l.c;
|
||||
l.outputs = l.out_w * l.out_h * l.out_c;
|
||||
l.output = calloc(l.outputs*batch, sizeof(float));
|
||||
l.forward = forward_crop_layer;
|
||||
l.backward = backward_crop_layer;
|
||||
|
||||
#ifdef GPU
|
||||
l.forward_gpu = forward_crop_layer_gpu;
|
||||
l.backward_gpu = backward_crop_layer_gpu;
|
||||
l.output_gpu = cuda_make_array(l.output, l.outputs*batch);
|
||||
l.rand_gpu = cuda_make_array(0, l.batch*8);
|
||||
#endif
|
||||
return l;
|
||||
}
|
||||
|
||||
void resize_crop_layer(layer *l, int w, int h)
|
||||
{
|
||||
l->w = w;
|
||||
l->h = h;
|
||||
|
||||
l->out_w = l->scale*w;
|
||||
l->out_h = l->scale*h;
|
||||
|
||||
l->inputs = l->w * l->h * l->c;
|
||||
l->outputs = l->out_h * l->out_w * l->out_c;
|
||||
|
||||
l->output = realloc(l->output, l->batch*l->outputs*sizeof(float));
|
||||
#ifdef GPU
|
||||
cuda_free(l->output_gpu);
|
||||
l->output_gpu = cuda_make_array(l->output, l->outputs*l->batch);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
void forward_crop_layer(const crop_layer l, network net)
|
||||
{
|
||||
int i,j,c,b,row,col;
|
||||
int index;
|
||||
int count = 0;
|
||||
int flip = (l.flip && rand()%2);
|
||||
int dh = rand()%(l.h - l.out_h + 1);
|
||||
int dw = rand()%(l.w - l.out_w + 1);
|
||||
float scale = 2;
|
||||
float trans = -1;
|
||||
if(l.noadjust){
|
||||
scale = 1;
|
||||
trans = 0;
|
||||
}
|
||||
if(!net.train){
|
||||
flip = 0;
|
||||
dh = (l.h - l.out_h)/2;
|
||||
dw = (l.w - l.out_w)/2;
|
||||
}
|
||||
for(b = 0; b < l.batch; ++b){
|
||||
for(c = 0; c < l.c; ++c){
|
||||
for(i = 0; i < l.out_h; ++i){
|
||||
for(j = 0; j < l.out_w; ++j){
|
||||
if(flip){
|
||||
col = l.w - dw - j - 1;
|
||||
}else{
|
||||
col = j + dw;
|
||||
}
|
||||
row = i + dh;
|
||||
index = col+l.w*(row+l.h*(c + l.c*b));
|
||||
l.output[count++] = net.input[index]*scale + trans;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,104 @@
|
|||
#include "crop_layer.h"
|
||||
#include <stdio.h>
|
||||
|
||||
#ifdef GPU
|
||||
#include "hip/hip_runtime.h"
|
||||
#endif
|
||||
|
||||
image get_crop_image(crop_layer l) {
|
||||
int h = l.out_h;
|
||||
int w = l.out_w;
|
||||
int c = l.out_c;
|
||||
return float_to_image(w, h, c, l.output);
|
||||
}
|
||||
|
||||
void backward_crop_layer(const crop_layer l, network net) {}
|
||||
|
||||
void backward_crop_layer_gpu(const crop_layer l, network net) {}
|
||||
|
||||
crop_layer make_crop_layer(int batch, int h, int w, int c, int crop_height, int crop_width, int flip, float angle,
|
||||
float saturation, float exposure) {
|
||||
fprintf(stderr, "Crop Layer: %d x %d -> %d x %d x %d image\n", h, w, crop_height, crop_width, c);
|
||||
crop_layer l = {(LAYER_TYPE)0};
|
||||
l.type = CROP;
|
||||
l.batch = batch;
|
||||
l.h = h;
|
||||
l.w = w;
|
||||
l.c = c;
|
||||
l.scale = (float) crop_height / h;
|
||||
l.flip = flip;
|
||||
l.angle = angle;
|
||||
l.saturation = saturation;
|
||||
l.exposure = exposure;
|
||||
l.out_w = crop_width;
|
||||
l.out_h = crop_height;
|
||||
l.out_c = c;
|
||||
l.inputs = l.w * l.h * l.c;
|
||||
l.outputs = l.out_w * l.out_h * l.out_c;
|
||||
l.output = (float*)calloc(l.outputs * batch, sizeof(float));
|
||||
l.forward = forward_crop_layer;
|
||||
l.backward = backward_crop_layer;
|
||||
|
||||
#ifdef GPU
|
||||
l.forward_gpu = forward_crop_layer_gpu;
|
||||
l.backward_gpu = backward_crop_layer_gpu;
|
||||
l.output_gpu = cuda_make_array(l.output, l.outputs*batch);
|
||||
l.rand_gpu = cuda_make_array(0, l.batch*8);
|
||||
#endif
|
||||
return l;
|
||||
}
|
||||
|
||||
void resize_crop_layer(layer *l, int w, int h) {
|
||||
l->w = w;
|
||||
l->h = h;
|
||||
|
||||
l->out_w = l->scale * w;
|
||||
l->out_h = l->scale * h;
|
||||
|
||||
l->inputs = l->w * l->h * l->c;
|
||||
l->outputs = l->out_h * l->out_w * l->out_c;
|
||||
|
||||
l->output = (float *) realloc(l->output, l->batch * l->outputs * sizeof(float));
|
||||
#ifdef GPU
|
||||
cuda_free(l->output_gpu);
|
||||
l->output_gpu = cuda_make_array(l->output, l->outputs*l->batch);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
void forward_crop_layer(const crop_layer l, network net) {
|
||||
int i, j, c, b, row, col;
|
||||
int index;
|
||||
int count = 0;
|
||||
int flip = (l.flip && rand() % 2);
|
||||
int dh = rand() % (l.h - l.out_h + 1);
|
||||
int dw = rand() % (l.w - l.out_w + 1);
|
||||
float scale = 2;
|
||||
float trans = -1;
|
||||
if (l.noadjust) {
|
||||
scale = 1;
|
||||
trans = 0;
|
||||
}
|
||||
if (!net.train) {
|
||||
flip = 0;
|
||||
dh = (l.h - l.out_h) / 2;
|
||||
dw = (l.w - l.out_w) / 2;
|
||||
}
|
||||
for (b = 0; b < l.batch; ++b) {
|
||||
for (c = 0; c < l.c; ++c) {
|
||||
for (i = 0; i < l.out_h; ++i) {
|
||||
for (j = 0; j < l.out_w; ++j) {
|
||||
if (flip) {
|
||||
col = l.w - dw - j - 1;
|
||||
} else {
|
||||
col = j + dw;
|
||||
}
|
||||
row = i + dh;
|
||||
index = col + l.w * (row + l.h * (c + l.c * b));
|
||||
l.output[count++] = net.input[index] * scale + trans;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -8,8 +8,12 @@
|
|||
typedef layer crop_layer;
|
||||
|
||||
image get_crop_image(crop_layer l);
|
||||
crop_layer make_crop_layer(int batch, int h, int w, int c, int crop_height, int crop_width, int flip, float angle, float saturation, float exposure);
|
||||
|
||||
crop_layer make_crop_layer(int batch, int h, int w, int c, int crop_height, int crop_width, int flip, float angle,
|
||||
float saturation, float exposure);
|
||||
|
||||
void forward_crop_layer(const crop_layer l, network net);
|
||||
|
||||
void resize_crop_layer(layer *l, int w, int h);
|
||||
|
||||
#ifdef GPU
|
||||
|
|
|
@ -1,105 +1,122 @@
|
|||
#include "cuda_runtime.h"
|
||||
#include "curand.h"
|
||||
#include "cublas_v2.h"
|
||||
#include "hiprand.h"
|
||||
#include "hipblas.h"
|
||||
|
||||
extern "C" {
|
||||
#include "crop_layer.h"
|
||||
#include "utils.h"
|
||||
#include "cuda.h"
|
||||
#include "image.h"
|
||||
|
||||
#ifdef GPU
|
||||
#include "hip/hip_runtime.h"
|
||||
#endif
|
||||
|
||||
__device__ float get_pixel_kernel(float *image, int w, int h, int x, int y, int c) {
|
||||
if (x < 0 || x >= w || y < 0 || y >= h) return 0;
|
||||
return image[x + w * (y + c * h)];
|
||||
}
|
||||
|
||||
__device__ float get_pixel_kernel(float *image, int w, int h, int x, int y, int c)
|
||||
{
|
||||
if(x < 0 || x >= w || y < 0 || y >= h) return 0;
|
||||
return image[x + w*(y + c*h)];
|
||||
__device__ float3
|
||||
rgb_to_hsv_kernel(float3
|
||||
rgb) {
|
||||
float r = rgb.x;
|
||||
float g = rgb.y;
|
||||
float b = rgb.z;
|
||||
|
||||
float h, s, v;
|
||||
float max = (r > g) ? ((r > b) ? r : b) : ((g > b) ? g : b);
|
||||
float min = (r < g) ? ((r < b) ? r : b) : ((g < b) ? g : b);
|
||||
float delta = max - min;
|
||||
v = max;
|
||||
if (max == 0) {
|
||||
s = 0;
|
||||
h = -1;
|
||||
} else {
|
||||
s = delta / max;
|
||||
if (r == max) {
|
||||
h = (g - b) / delta;
|
||||
} else if (g == max) {
|
||||
h = 2 + (b - r) / delta;
|
||||
} else {
|
||||
h = 4 + (r - g) / delta;
|
||||
}
|
||||
if (h < 0) h += 6;
|
||||
}
|
||||
return
|
||||
make_float3(h, s, v
|
||||
);
|
||||
}
|
||||
|
||||
__device__ float3 rgb_to_hsv_kernel(float3 rgb)
|
||||
{
|
||||
float r = rgb.x;
|
||||
float g = rgb.y;
|
||||
float b = rgb.z;
|
||||
__device__ float3
|
||||
hsv_to_rgb_kernel(float3
|
||||
hsv) {
|
||||
float h = hsv.x;
|
||||
float s = hsv.y;
|
||||
float v = hsv.z;
|
||||
|
||||
float h, s, v;
|
||||
float max = (r > g) ? ( (r > b) ? r : b) : ( (g > b) ? g : b);
|
||||
float min = (r < g) ? ( (r < b) ? r : b) : ( (g < b) ? g : b);
|
||||
float delta = max - min;
|
||||
v = max;
|
||||
if(max == 0){
|
||||
s = 0;
|
||||
h = -1;
|
||||
}else{
|
||||
s = delta/max;
|
||||
if(r == max){
|
||||
h = (g - b) / delta;
|
||||
} else if (g == max) {
|
||||
h = 2 + (b - r) / delta;
|
||||
} else {
|
||||
h = 4 + (r - g) / delta;
|
||||
}
|
||||
if (h < 0) h += 6;
|
||||
}
|
||||
return make_float3(h, s, v);
|
||||
float r, g, b;
|
||||
float f, p, q, t;
|
||||
|
||||
if (s == 0) {
|
||||
r = g = b = v;
|
||||
} else {
|
||||
int index = (int) floorf(h);
|
||||
f = h - index;
|
||||
p = v * (1 - s);
|
||||
q = v * (1 - s * f);
|
||||
t = v * (1 - s * (1 - f));
|
||||
if (index == 0) {
|
||||
r = v;
|
||||
g = t;
|
||||
b = p;
|
||||
} else if (index == 1) {
|
||||
r = q;
|
||||
g = v;
|
||||
b = p;
|
||||
} else if (index == 2) {
|
||||
r = p;
|
||||
g = v;
|
||||
b = t;
|
||||
} else if (index == 3) {
|
||||
r = p;
|
||||
g = q;
|
||||
b = v;
|
||||
} else if (index == 4) {
|
||||
r = t;
|
||||
g = p;
|
||||
b = v;
|
||||
} else {
|
||||
r = v;
|
||||
g = p;
|
||||
b = q;
|
||||
}
|
||||
}
|
||||
r = (r < 0) ? 0 : ((r > 1) ? 1 : r);
|
||||
g = (g < 0) ? 0 : ((g > 1) ? 1 : g);
|
||||
b = (b < 0) ? 0 : ((b > 1) ? 1 : b);
|
||||
return
|
||||
make_float3(r, g, b
|
||||
);
|
||||
}
|
||||
|
||||
__device__ float3 hsv_to_rgb_kernel(float3 hsv)
|
||||
{
|
||||
float h = hsv.x;
|
||||
float s = hsv.y;
|
||||
float v = hsv.z;
|
||||
|
||||
float r, g, b;
|
||||
float f, p, q, t;
|
||||
|
||||
if (s == 0) {
|
||||
r = g = b = v;
|
||||
} else {
|
||||
int index = (int) floorf(h);
|
||||
f = h - index;
|
||||
p = v*(1-s);
|
||||
q = v*(1-s*f);
|
||||
t = v*(1-s*(1-f));
|
||||
if(index == 0){
|
||||
r = v; g = t; b = p;
|
||||
} else if(index == 1){
|
||||
r = q; g = v; b = p;
|
||||
} else if(index == 2){
|
||||
r = p; g = v; b = t;
|
||||
} else if(index == 3){
|
||||
r = p; g = q; b = v;
|
||||
} else if(index == 4){
|
||||
r = t; g = p; b = v;
|
||||
} else {
|
||||
r = v; g = p; b = q;
|
||||
}
|
||||
}
|
||||
r = (r < 0) ? 0 : ((r > 1) ? 1 : r);
|
||||
g = (g < 0) ? 0 : ((g > 1) ? 1 : g);
|
||||
b = (b < 0) ? 0 : ((b > 1) ? 1 : b);
|
||||
return make_float3(r, g, b);
|
||||
}
|
||||
|
||||
__device__ float bilinear_interpolate_kernel(float *image, int w, int h, float x, float y, int c)
|
||||
{
|
||||
__device__ float bilinear_interpolate_kernel(float *image, int w, int h, float x, float y, int c) {
|
||||
int ix = (int) floorf(x);
|
||||
int iy = (int) floorf(y);
|
||||
|
||||
float dx = x - ix;
|
||||
float dy = y - iy;
|
||||
|
||||
float val = (1-dy) * (1-dx) * get_pixel_kernel(image, w, h, ix, iy, c) +
|
||||
dy * (1-dx) * get_pixel_kernel(image, w, h, ix, iy+1, c) +
|
||||
(1-dy) * dx * get_pixel_kernel(image, w, h, ix+1, iy, c) +
|
||||
dy * dx * get_pixel_kernel(image, w, h, ix+1, iy+1, c);
|
||||
float val = (1 - dy) * (1 - dx) * get_pixel_kernel(image, w, h, ix, iy, c) +
|
||||
dy * (1 - dx) * get_pixel_kernel(image, w, h, ix, iy + 1, c) +
|
||||
(1 - dy) * dx * get_pixel_kernel(image, w, h, ix + 1, iy, c) +
|
||||
dy * dx * get_pixel_kernel(image, w, h, ix + 1, iy + 1, c);
|
||||
return val;
|
||||
}
|
||||
|
||||
__global__ void levels_image_kernel(float *image, float *rand, int batch, int w, int h, int train, float saturation, float exposure, float translate, float scale, float shift)
|
||||
{
|
||||
__global__ void
|
||||
levels_image_kernel(float *image, float *rand, int batch, int w, int h, int train, float saturation, float exposure,
|
||||
float translate, float scale, float shift) {
|
||||
int size = batch * w * h;
|
||||
int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
|
||||
if(id >= size) return;
|
||||
int id = (blockIdx.x + blockIdx.y * gridDim.x) * blockDim.x + threadIdx.x;
|
||||
if (id >= size) return;
|
||||
int x = id % w;
|
||||
id /= w;
|
||||
int y = id % h;
|
||||
|
@ -107,23 +124,23 @@ __global__ void levels_image_kernel(float *image, float *rand, int batch, int w,
|
|||
float rshift = rand[0];
|
||||
float gshift = rand[1];
|
||||
float bshift = rand[2];
|
||||
float r0 = rand[8*id + 0];
|
||||
float r1 = rand[8*id + 1];
|
||||
float r2 = rand[8*id + 2];
|
||||
float r3 = rand[8*id + 3];
|
||||
float r0 = rand[8 * id + 0];
|
||||
float r1 = rand[8 * id + 1];
|
||||
float r2 = rand[8 * id + 2];
|
||||
float r3 = rand[8 * id + 3];
|
||||
|
||||
saturation = r0*(saturation - 1) + 1;
|
||||
saturation = (r1 > .5f) ? 1.f/saturation : saturation;
|
||||
exposure = r2*(exposure - 1) + 1;
|
||||
exposure = (r3 > .5f) ? 1.f/exposure : exposure;
|
||||
saturation = r0 * (saturation - 1) + 1;
|
||||
saturation = (r1 > .5f) ? 1.f / saturation : saturation;
|
||||
exposure = r2 * (exposure - 1) + 1;
|
||||
exposure = (r3 > .5f) ? 1.f / exposure : exposure;
|
||||
|
||||
size_t offset = id * h * w * 3;
|
||||
image += offset;
|
||||
float r = image[x + w*(y + h*0)];
|
||||
float g = image[x + w*(y + h*1)];
|
||||
float b = image[x + w*(y + h*2)];
|
||||
float3 rgb = make_float3(r,g,b);
|
||||
if(train){
|
||||
float r = image[x + w * (y + h * 0)];
|
||||
float g = image[x + w * (y + h * 1)];
|
||||
float b = image[x + w * (y + h * 2)];
|
||||
float3 rgb = make_float3(r, g, b);
|
||||
if (train) {
|
||||
float3 hsv = rgb_to_hsv_kernel(rgb);
|
||||
hsv.y *= saturation;
|
||||
hsv.z *= exposure;
|
||||
|
@ -131,18 +148,19 @@ __global__ void levels_image_kernel(float *image, float *rand, int batch, int w,
|
|||
} else {
|
||||
shift = 0;
|
||||
}
|
||||
image[x + w*(y + h*0)] = rgb.x*scale + translate + (rshift - .5f)*shift;
|
||||
image[x + w*(y + h*1)] = rgb.y*scale + translate + (gshift - .5f)*shift;
|
||||
image[x + w*(y + h*2)] = rgb.z*scale + translate + (bshift - .5f)*shift;
|
||||
image[x + w * (y + h * 0)] = rgb.x * scale + translate + (rshift - .5f) * shift;
|
||||
image[x + w * (y + h * 1)] = rgb.y * scale + translate + (gshift - .5f) * shift;
|
||||
image[x + w * (y + h * 2)] = rgb.z * scale + translate + (bshift - .5f) * shift;
|
||||
}
|
||||
|
||||
__global__ void forward_crop_layer_kernel(float *input, float *rand, int size, int c, int h, int w, int crop_height, int crop_width, int train, int flip, float angle, float *output)
|
||||
{
|
||||
int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
|
||||
if(id >= size) return;
|
||||
__global__ void
|
||||
forward_crop_layer_kernel(float *input, float *rand, int size, int c, int h, int w, int crop_height, int crop_width,
|
||||
int train, int flip, float angle, float *output) {
|
||||
int id = (blockIdx.x + blockIdx.y * gridDim.x) * blockDim.x + threadIdx.x;
|
||||
if (id >= size) return;
|
||||
|
||||
float cx = w/2.f;
|
||||
float cy = h/2.f;
|
||||
float cx = w / 2.f;
|
||||
float cy = h / 2.f;
|
||||
|
||||
int count = id;
|
||||
int j = id % crop_width;
|
||||
|
@ -153,55 +171,58 @@ __global__ void forward_crop_layer_kernel(float *input, float *rand, int size, i
|
|||
id /= c;
|
||||
int b = id;
|
||||
|
||||
float r4 = rand[8*b + 4];
|
||||
float r5 = rand[8*b + 5];
|
||||
float r6 = rand[8*b + 6];
|
||||
float r7 = rand[8*b + 7];
|
||||
float r4 = rand[8 * b + 4];
|
||||
float r5 = rand[8 * b + 5];
|
||||
float r6 = rand[8 * b + 6];
|
||||
float r7 = rand[8 * b + 7];
|
||||
|
||||
float dw = (w - crop_width)*r4;
|
||||
float dh = (h - crop_height)*r5;
|
||||
float dw = (w - crop_width) * r4;
|
||||
float dh = (h - crop_height) * r5;
|
||||
flip = (flip && (r6 > .5f));
|
||||
angle = 2*angle*r7 - angle;
|
||||
if(!train){
|
||||
dw = (w - crop_width)/2.f;
|
||||
dh = (h - crop_height)/2.f;
|
||||
angle = 2 * angle * r7 - angle;
|
||||
if (!train) {
|
||||
dw = (w - crop_width) / 2.f;
|
||||
dh = (h - crop_height) / 2.f;
|
||||
flip = 0;
|
||||
angle = 0;
|
||||
}
|
||||
|
||||
input += w*h*c*b;
|
||||
input += w * h * c * b;
|
||||
|
||||
float x = (flip) ? w - dw - j - 1 : j + dw;
|
||||
float x = (flip) ? w - dw - j - 1 : j + dw;
|
||||
float y = i + dh;
|
||||
|
||||
float rx = cosf(angle)*(x-cx) - sinf(angle)*(y-cy) + cx;
|
||||
float ry = sinf(angle)*(x-cx) + cosf(angle)*(y-cy) + cy;
|
||||
float rx = cosf(angle) * (x - cx) - sinf(angle) * (y - cy) + cx;
|
||||
float ry = sinf(angle) * (x - cx) + cosf(angle) * (y - cy) + cy;
|
||||
|
||||
output[count] = bilinear_interpolate_kernel(input, w, h, rx, ry, k);
|
||||
}
|
||||
|
||||
extern "C" void forward_crop_layer_gpu(crop_layer layer, network net)
|
||||
{
|
||||
cuda_random(layer.rand_gpu, layer.batch*8);
|
||||
void forward_crop_layer_gpu(crop_layer layer, network net) {
|
||||
cuda_random(layer.rand_gpu, layer.batch * 8);
|
||||
|
||||
float radians = layer.angle*3.14159265f/180.f;
|
||||
float radians = layer.angle * 3.14159265f / 180.f;
|
||||
|
||||
float scale = 2;
|
||||
float translate = -1;
|
||||
if(layer.noadjust){
|
||||
if (layer.noadjust) {
|
||||
scale = 1;
|
||||
translate = 0;
|
||||
}
|
||||
|
||||
int size = layer.batch * layer.w * layer.h;
|
||||
|
||||
levels_image_kernel<<<cuda_gridsize(size), BLOCK>>>(net.input_gpu, layer.rand_gpu, layer.batch, layer.w, layer.h, net.train, layer.saturation, layer.exposure, translate, scale, layer.shift);
|
||||
check_error(cudaPeekAtLastError());
|
||||
levels_image_kernel<<<cuda_gridsize(size), BLOCK>>>(net.input_gpu, layer.rand_gpu, layer.batch, layer.w, layer.h,
|
||||
net.train, layer.saturation, layer.exposure, translate, scale,
|
||||
layer.shift);
|
||||
check_error(hipPeekAtLastError());
|
||||
|
||||
size = layer.batch*layer.c*layer.out_w*layer.out_h;
|
||||
size = layer.batch * layer.c * layer.out_w * layer.out_h;
|
||||
|
||||
forward_crop_layer_kernel<<<cuda_gridsize(size), BLOCK>>>(net.input_gpu, layer.rand_gpu, size, layer.c, layer.h, layer.w, layer.out_h, layer.out_w, net.train, layer.flip, radians, layer.output_gpu);
|
||||
check_error(cudaPeekAtLastError());
|
||||
forward_crop_layer_kernel<<<cuda_gridsize(size), BLOCK>>>(net.input_gpu, layer.rand_gpu, size, layer.c, layer.h,
|
||||
layer.w, layer.out_h, layer.out_w, net.train, layer.flip,
|
||||
radians, layer.output_gpu);
|
||||
check_error(hipPeekAtLastError());
|
||||
|
||||
/*
|
||||
cuda_pull_array(layer.output_gpu, layer.output, size);
|
||||
|
|
178
src/cuda.c
178
src/cuda.c
|
@ -1,178 +0,0 @@
|
|||
int gpu_index = 0;
|
||||
|
||||
#ifdef GPU
|
||||
|
||||
#include "cuda.h"
|
||||
#include "utils.h"
|
||||
#include "blas.h"
|
||||
#include <assert.h>
|
||||
#include <stdlib.h>
|
||||
#include <time.h>
|
||||
|
||||
void cuda_set_device(int n)
|
||||
{
|
||||
gpu_index = n;
|
||||
cudaError_t status = cudaSetDevice(n);
|
||||
check_error(status);
|
||||
}
|
||||
|
||||
int cuda_get_device()
|
||||
{
|
||||
int n = 0;
|
||||
cudaError_t status = cudaGetDevice(&n);
|
||||
check_error(status);
|
||||
return n;
|
||||
}
|
||||
|
||||
void check_error(cudaError_t status)
|
||||
{
|
||||
//cudaDeviceSynchronize();
|
||||
cudaError_t status2 = cudaGetLastError();
|
||||
if (status != cudaSuccess)
|
||||
{
|
||||
const char *s = cudaGetErrorString(status);
|
||||
char buffer[256];
|
||||
printf("CUDA Error: %s\n", s);
|
||||
assert(0);
|
||||
snprintf(buffer, 256, "CUDA Error: %s", s);
|
||||
error(buffer);
|
||||
}
|
||||
if (status2 != cudaSuccess)
|
||||
{
|
||||
const char *s = cudaGetErrorString(status);
|
||||
char buffer[256];
|
||||
printf("CUDA Error Prev: %s\n", s);
|
||||
assert(0);
|
||||
snprintf(buffer, 256, "CUDA Error Prev: %s", s);
|
||||
error(buffer);
|
||||
}
|
||||
}
|
||||
|
||||
dim3 cuda_gridsize(size_t n){
|
||||
size_t k = (n-1) / BLOCK + 1;
|
||||
size_t x = k;
|
||||
size_t y = 1;
|
||||
if(x > 65535){
|
||||
x = ceil(sqrt(k));
|
||||
y = (n-1)/(x*BLOCK) + 1;
|
||||
}
|
||||
dim3 d = {x, y, 1};
|
||||
//printf("%ld %ld %ld %ld\n", n, x, y, x*y*BLOCK);
|
||||
return d;
|
||||
}
|
||||
|
||||
#ifdef CUDNN
|
||||
cudnnHandle_t cudnn_handle()
|
||||
{
|
||||
static int init[16] = {0};
|
||||
static cudnnHandle_t handle[16];
|
||||
int i = cuda_get_device();
|
||||
if(!init[i]) {
|
||||
cudnnCreate(&handle[i]);
|
||||
init[i] = 1;
|
||||
}
|
||||
return handle[i];
|
||||
}
|
||||
#endif
|
||||
|
||||
cublasHandle_t blas_handle()
|
||||
{
|
||||
static int init[16] = {0};
|
||||
static cublasHandle_t handle[16];
|
||||
int i = cuda_get_device();
|
||||
if(!init[i]) {
|
||||
cublasCreate(&handle[i]);
|
||||
init[i] = 1;
|
||||
}
|
||||
return handle[i];
|
||||
}
|
||||
|
||||
float *cuda_make_array(float *x, size_t n)
|
||||
{
|
||||
float *x_gpu;
|
||||
size_t size = sizeof(float)*n;
|
||||
cudaError_t status = cudaMalloc((void **)&x_gpu, size);
|
||||
check_error(status);
|
||||
if(x){
|
||||
status = cudaMemcpy(x_gpu, x, size, cudaMemcpyHostToDevice);
|
||||
check_error(status);
|
||||
} else {
|
||||
fill_gpu(n, 0, x_gpu, 1);
|
||||
}
|
||||
if(!x_gpu) error("Cuda malloc failed\n");
|
||||
return x_gpu;
|
||||
}
|
||||
|
||||
void cuda_random(float *x_gpu, size_t n)
|
||||
{
|
||||
static curandGenerator_t gen[16];
|
||||
static int init[16] = {0};
|
||||
int i = cuda_get_device();
|
||||
if(!init[i]){
|
||||
curandCreateGenerator(&gen[i], CURAND_RNG_PSEUDO_DEFAULT);
|
||||
curandSetPseudoRandomGeneratorSeed(gen[i], time(0));
|
||||
init[i] = 1;
|
||||
}
|
||||
curandGenerateUniform(gen[i], x_gpu, n);
|
||||
check_error(cudaPeekAtLastError());
|
||||
}
|
||||
|
||||
float cuda_compare(float *x_gpu, float *x, size_t n, char *s)
|
||||
{
|
||||
float *tmp = calloc(n, sizeof(float));
|
||||
cuda_pull_array(x_gpu, tmp, n);
|
||||
//int i;
|
||||
//for(i = 0; i < n; ++i) printf("%f %f\n", tmp[i], x[i]);
|
||||
axpy_cpu(n, -1, x, 1, tmp, 1);
|
||||
float err = dot_cpu(n, tmp, 1, tmp, 1);
|
||||
printf("Error %s: %f\n", s, sqrt(err/n));
|
||||
free(tmp);
|
||||
return err;
|
||||
}
|
||||
|
||||
int *cuda_make_int_array(int *x, size_t n)
|
||||
{
|
||||
int *x_gpu;
|
||||
size_t size = sizeof(int)*n;
|
||||
cudaError_t status = cudaMalloc((void **)&x_gpu, size);
|
||||
check_error(status);
|
||||
if(x){
|
||||
status = cudaMemcpy(x_gpu, x, size, cudaMemcpyHostToDevice);
|
||||
check_error(status);
|
||||
}
|
||||
if(!x_gpu) error("Cuda malloc failed\n");
|
||||
return x_gpu;
|
||||
}
|
||||
|
||||
void cuda_free(float *x_gpu)
|
||||
{
|
||||
cudaError_t status = cudaFree(x_gpu);
|
||||
check_error(status);
|
||||
}
|
||||
|
||||
void cuda_push_array(float *x_gpu, float *x, size_t n)
|
||||
{
|
||||
size_t size = sizeof(float)*n;
|
||||
cudaError_t status = cudaMemcpy(x_gpu, x, size, cudaMemcpyHostToDevice);
|
||||
check_error(status);
|
||||
}
|
||||
|
||||
void cuda_pull_array(float *x_gpu, float *x, size_t n)
|
||||
{
|
||||
size_t size = sizeof(float)*n;
|
||||
cudaError_t status = cudaMemcpy(x, x_gpu, size, cudaMemcpyDeviceToHost);
|
||||
check_error(status);
|
||||
}
|
||||
|
||||
float cuda_mag_array(float *x_gpu, size_t n)
|
||||
{
|
||||
float *temp = calloc(n, sizeof(float));
|
||||
cuda_pull_array(x_gpu, temp, n);
|
||||
float m = mag_array(temp, n);
|
||||
free(temp);
|
||||
return m;
|
||||
}
|
||||
#else
|
||||
void cuda_set_device(int n){}
|
||||
|
||||
#endif
|
|
@ -0,0 +1,166 @@
|
|||
int gpu_index = 0;
|
||||
|
||||
#ifdef GPU
|
||||
|
||||
#include "hip/hip_runtime.h"
|
||||
#include "utils.h"
|
||||
#include "blas.h"
|
||||
#include <assert.h>
|
||||
#include <stdlib.h>
|
||||
#include <time.h>
|
||||
|
||||
void cuda_set_device(int n) {
|
||||
gpu_index = n;
|
||||
hipError_t status = hipSetDevice(n);
|
||||
check_error(status);
|
||||
}
|
||||
|
||||
int cuda_get_device() {
|
||||
int n = 0;
|
||||
hipError_t status = hipGetDevice(&n);
|
||||
check_error(status);
|
||||
return n;
|
||||
}
|
||||
|
||||
void check_error(hipError_t status) {
|
||||
//hipDeviceSynchronize();
|
||||
hipError_t status2 = hipGetLastError();
|
||||
if (status != hipSuccess) {
|
||||
const char *s = hipGetErrorString(status);
|
||||
char buffer[256];
|
||||
printf("CUDA Error: %s\n", s);
|
||||
assert(0);
|
||||
snprintf(buffer, 256, "CUDA Error: %s", s);
|
||||
error(buffer);
|
||||
}
|
||||
if (status2 != hipSuccess) {
|
||||
const char *s = hipGetErrorString(status);
|
||||
char buffer[256];
|
||||
printf("CUDA Error Prev: %s\n", s);
|
||||
assert(0);
|
||||
snprintf(buffer, 256, "CUDA Error Prev: %s", s);
|
||||
error(buffer);
|
||||
}
|
||||
}
|
||||
|
||||
dim3 cuda_gridsize(size_t n) {
|
||||
size_t k = (n - 1) / BLOCK + 1;
|
||||
size_t x = k;
|
||||
size_t y = 1;
|
||||
if (x > 65535) {
|
||||
x = ceil(sqrt(k));
|
||||
y = (n - 1) / (x * BLOCK) + 1;
|
||||
}
|
||||
dim3 d = {(uint32_t)x, (uint32_t)y, 1};
|
||||
//printf("%ld %ld %ld %ld\n", n, x, y, x*y*BLOCK);
|
||||
return d;
|
||||
}
|
||||
|
||||
#ifdef CUDNN
|
||||
hipdnnHandle_t cudnn_handle()
|
||||
{
|
||||
static int init[16] = {0};
|
||||
static hipdnnHandle_t handle[16];
|
||||
int i = cuda_get_device();
|
||||
if(!init[i]) {
|
||||
hipdnnCreate(&handle[i]);
|
||||
init[i] = 1;
|
||||
}
|
||||
return handle[i];
|
||||
}
|
||||
#endif
|
||||
|
||||
hipblasHandle_t blas_handle() {
|
||||
static int init[16] = {0};
|
||||
static hipblasHandle_t handle[16];
|
||||
int i = cuda_get_device();
|
||||
if (!init[i]) {
|
||||
hipblasCreate(&handle[i]);
|
||||
init[i] = 1;
|
||||
}
|
||||
return handle[i];
|
||||
}
|
||||
|
||||
float *cuda_make_array(float *x, size_t n) {
|
||||
float *x_gpu;
|
||||
size_t size = sizeof(float) * n;
|
||||
hipError_t status = hipMalloc((void **) &x_gpu, size);
|
||||
check_error(status);
|
||||
if (x) {
|
||||
status = hipMemcpy(x_gpu, x, size, hipMemcpyHostToDevice);
|
||||
check_error(status);
|
||||
} else {
|
||||
fill_gpu(n, 0, x_gpu, 1);
|
||||
}
|
||||
if (!x_gpu) error("Cuda malloc failed\n");
|
||||
return x_gpu;
|
||||
}
|
||||
|
||||
void cuda_random(float *x_gpu, size_t n) {
|
||||
static hiprandGenerator_t gen[16];
|
||||
static int init[16] = {0};
|
||||
int i = cuda_get_device();
|
||||
if (!init[i]) {
|
||||
hiprandCreateGenerator(&gen[i], HIPRAND_RNG_PSEUDO_DEFAULT);
|
||||
hiprandSetPseudoRandomGeneratorSeed(gen[i], time(0));
|
||||
init[i] = 1;
|
||||
}
|
||||
hiprandGenerateUniform(gen[i], x_gpu, n);
|
||||
check_error(hipPeekAtLastError());
|
||||
}
|
||||
|
||||
float cuda_compare(float *x_gpu, float *x, size_t n, char *s) {
|
||||
float *tmp = (float*)calloc(n, sizeof(float));
|
||||
cuda_pull_array(x_gpu, tmp, n);
|
||||
//int i;
|
||||
//for(i = 0; i < n; ++i) printf("%f %f\n", tmp[i], x[i]);
|
||||
axpy_cpu(n, -1, x, 1, tmp, 1);
|
||||
float err = dot_cpu(n, tmp, 1, tmp, 1);
|
||||
printf("Error %s: %f\n", s, sqrt(err / n));
|
||||
free(tmp);
|
||||
return err;
|
||||
}
|
||||
|
||||
int *cuda_make_int_array(int *x, size_t n) {
|
||||
int *x_gpu;
|
||||
size_t size = sizeof(int) * n;
|
||||
hipError_t status = hipMalloc((void **) &x_gpu, size);
|
||||
check_error(status);
|
||||
if (x) {
|
||||
status = hipMemcpy(x_gpu, x, size, hipMemcpyHostToDevice);
|
||||
check_error(status);
|
||||
}
|
||||
if (!x_gpu) error("Cuda malloc failed\n");
|
||||
return x_gpu;
|
||||
}
|
||||
|
||||
void cuda_free(float *x_gpu) {
|
||||
hipError_t status = hipFree(x_gpu);
|
||||
check_error(status);
|
||||
}
|
||||
|
||||
void cuda_push_array(float *x_gpu, float *x, size_t n) {
|
||||
size_t size = sizeof(float) * n;
|
||||
hipError_t status = hipMemcpy(x_gpu, x, size, hipMemcpyHostToDevice);
|
||||
check_error(status);
|
||||
}
|
||||
|
||||
void cuda_pull_array(float *x_gpu, float *x, size_t n) {
|
||||
size_t size = sizeof(float) * n;
|
||||
hipError_t status = hipMemcpy(x, x_gpu, size, hipMemcpyDeviceToHost);
|
||||
check_error(status);
|
||||
}
|
||||
|
||||
float cuda_mag_array(float *x_gpu, size_t n) {
|
||||
float *temp = (float*)calloc(n, sizeof(float));
|
||||
cuda_pull_array(x_gpu, temp, n);
|
||||
float m = mag_array(temp, n);
|
||||
free(temp);
|
||||
return m;
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
void cuda_set_device(int n) {}
|
||||
|
||||
#endif
|
|
@ -5,16 +5,17 @@
|
|||
|
||||
#ifdef GPU
|
||||
|
||||
void check_error(cudaError_t status);
|
||||
cublasHandle_t blas_handle();
|
||||
void check_error(hipError_t status);
|
||||
hipblasHandle_t blas_handle();
|
||||
int *cuda_make_int_array(int *x, size_t n);
|
||||
void cuda_random(float *x_gpu, size_t n);
|
||||
float cuda_compare(float *x_gpu, float *x, size_t n, char *s);
|
||||
dim3 cuda_gridsize(size_t n);
|
||||
|
||||
#ifdef CUDNN
|
||||
cudnnHandle_t cudnn_handle();
|
||||
hipdnnHandle_t cudnn_handle();
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
|
File diff suppressed because it is too large
Load Diff
46
src/data.h
46
src/data.h
|
@ -1,5 +1,6 @@
|
|||
#ifndef DATA_H
|
||||
#define DATA_H
|
||||
|
||||
#include <pthread.h>
|
||||
|
||||
#include "darknet.h"
|
||||
|
@ -8,43 +9,68 @@
|
|||
#include "image.h"
|
||||
#include "tree.h"
|
||||
|
||||
static inline float distance_from_edge(int x, int max)
|
||||
{
|
||||
int dx = (max/2) - x;
|
||||
static inline float distance_from_edge(int x, int max) {
|
||||
int dx = (max / 2) - x;
|
||||
if (dx < 0) dx = -dx;
|
||||
dx = (max/2) + 1 - dx;
|
||||
dx = (max / 2) + 1 - dx;
|
||||
dx *= 2;
|
||||
float dist = (float)dx/max;
|
||||
float dist = (float) dx / max;
|
||||
if (dist > 1) dist = 1;
|
||||
return dist;
|
||||
}
|
||||
|
||||
void load_data_blocking(load_args args);
|
||||
|
||||
|
||||
void print_letters(float *pred, int n);
|
||||
|
||||
data load_data_captcha(char **paths, int n, int m, int k, int w, int h);
|
||||
|
||||
data load_data_captcha_encode(char **paths, int n, int m, int w, int h);
|
||||
data load_data_detection(int n, char **paths, int m, int w, int h, int boxes, int classes, float jitter, float hue, float saturation, float exposure);
|
||||
data load_data_tag(char **paths, int n, int m, int k, int min, int max, int size, float angle, float aspect, float hue, float saturation, float exposure);
|
||||
matrix load_image_augment_paths(char **paths, int n, int min, int max, int size, float angle, float aspect, float hue, float saturation, float exposure, int center);
|
||||
|
||||
data load_data_detection(int n, char **paths, int m, int w, int h, int boxes, int classes, float jitter, float hue,
|
||||
float saturation, float exposure);
|
||||
|
||||
data load_data_tag(char **paths, int n, int m, int k, int min, int max, int size, float angle, float aspect, float hue,
|
||||
float saturation, float exposure);
|
||||
|
||||
matrix load_image_augment_paths(char **paths, int n, int min, int max, int size, float angle, float aspect, float hue,
|
||||
float saturation, float exposure, int center);
|
||||
|
||||
data load_data_super(char **paths, int n, int m, int w, int h, int scale);
|
||||
data load_data_augment(char **paths, int n, int m, char **labels, int k, tree *hierarchy, int min, int max, int size, float angle, float aspect, float hue, float saturation, float exposure, int center);
|
||||
data load_data_regression(char **paths, int n, int m, int classes, int min, int max, int size, float angle, float aspect, float hue, float saturation, float exposure);
|
||||
|
||||
data load_data_augment(char **paths, int n, int m, char **labels, int k, tree *hierarchy, int min, int max, int size,
|
||||
float angle, float aspect, float hue, float saturation, float exposure, int center);
|
||||
|
||||
data
|
||||
load_data_regression(char **paths, int n, int m, int classes, int min, int max, int size, float angle, float aspect,
|
||||
float hue, float saturation, float exposure);
|
||||
|
||||
data load_go(char *filename);
|
||||
|
||||
|
||||
data load_data_writing(char **paths, int n, int m, int w, int h, int out_w, int out_h);
|
||||
|
||||
void get_random_batch(data d, int n, float *X, float *y);
|
||||
|
||||
data get_data_part(data d, int part, int total);
|
||||
|
||||
data get_random_data(data d, int num);
|
||||
|
||||
data load_categorical_data_csv(char *filename, int target, int k);
|
||||
|
||||
void normalize_data_rows(data d);
|
||||
|
||||
void scale_data_rows(data d, float s);
|
||||
|
||||
void translate_data_rows(data d, float s);
|
||||
|
||||
void randomize_data(data d);
|
||||
|
||||
data *split_data(data d, int part, int total);
|
||||
|
||||
data concat_datas(data *d, int n);
|
||||
|
||||
void fill_truth(char *path, char **labels, int k, float *truth);
|
||||
|
||||
#endif
|
||||
|
|
|
@ -1,8 +1,6 @@
|
|||
#include "cuda_runtime.h"
|
||||
#include "curand.h"
|
||||
#include "cublas_v2.h"
|
||||
#include "hiprand.h"
|
||||
#include "hipblas.h"
|
||||
|
||||
extern "C" {
|
||||
#include "convolutional_layer.h"
|
||||
#include "deconvolutional_layer.h"
|
||||
#include "batchnorm_layer.h"
|
||||
|
@ -11,127 +9,127 @@ extern "C" {
|
|||
#include "im2col.h"
|
||||
#include "col2im.h"
|
||||
#include "utils.h"
|
||||
#include "cuda.h"
|
||||
}
|
||||
|
||||
extern "C" void forward_deconvolutional_layer_gpu(layer l, network net)
|
||||
{
|
||||
#ifdef GPU
|
||||
#include "hip/hip_runtime.h"
|
||||
#endif
|
||||
|
||||
void forward_deconvolutional_layer_gpu(layer l, network net) {
|
||||
int i;
|
||||
|
||||
int m = l.size*l.size*l.n;
|
||||
int n = l.h*l.w;
|
||||
int m = l.size * l.size * l.n;
|
||||
int n = l.h * l.w;
|
||||
int k = l.c;
|
||||
|
||||
fill_gpu(l.outputs*l.batch, 0, l.output_gpu, 1);
|
||||
fill_gpu(l.outputs * l.batch, 0, l.output_gpu, 1);
|
||||
|
||||
for(i = 0; i < l.batch; ++i){
|
||||
for (i = 0; i < l.batch; ++i) {
|
||||
float *a = l.weights_gpu;
|
||||
float *b = net.input_gpu + i*l.c*l.h*l.w;
|
||||
float *b = net.input_gpu + i * l.c * l.h * l.w;
|
||||
float *c = net.workspace;
|
||||
|
||||
gemm_gpu(1,0,m,n,k,1,a,m,b,n,0,c,n);
|
||||
gemm_gpu(1, 0, m, n, k, 1, a, m, b, n, 0, c, n);
|
||||
|
||||
col2im_gpu(net.workspace, l.out_c, l.out_h, l.out_w, l.size, l.stride, l.pad, l.output_gpu+i*l.outputs);
|
||||
col2im_gpu(net.workspace, l.out_c, l.out_h, l.out_w, l.size, l.stride, l.pad, l.output_gpu + i * l.outputs);
|
||||
}
|
||||
if (l.batch_normalize) {
|
||||
forward_batchnorm_layer_gpu(l, net);
|
||||
} else {
|
||||
add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.n, l.out_w*l.out_h);
|
||||
add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.n, l.out_w * l.out_h);
|
||||
}
|
||||
activate_array_gpu(l.output_gpu, l.batch*l.n*l.out_w*l.out_h, l.activation);
|
||||
activate_array_gpu(l.output_gpu, l.batch * l.n * l.out_w * l.out_h, l.activation);
|
||||
}
|
||||
|
||||
extern "C" void backward_deconvolutional_layer_gpu(layer l, network net)
|
||||
{
|
||||
void backward_deconvolutional_layer_gpu(layer l, network net) {
|
||||
int i;
|
||||
|
||||
//constrain_gpu(l.outputs*l.batch, 1, l.delta_gpu, 1);
|
||||
gradient_array_gpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu);
|
||||
gradient_array_gpu(l.output_gpu, l.outputs * l.batch, l.activation, l.delta_gpu);
|
||||
|
||||
if(l.batch_normalize){
|
||||
if (l.batch_normalize) {
|
||||
backward_batchnorm_layer_gpu(l, net);
|
||||
} else {
|
||||
backward_bias_gpu(l.bias_updates_gpu, l.delta_gpu, l.batch, l.n, l.out_w*l.out_h);
|
||||
backward_bias_gpu(l.bias_updates_gpu, l.delta_gpu, l.batch, l.n, l.out_w * l.out_h);
|
||||
}
|
||||
|
||||
//if(net.delta_gpu) memset(net.delta_gpu, 0, l.batch*l.h*l.w*l.c*sizeof(float));
|
||||
|
||||
for(i = 0; i < l.batch; ++i){
|
||||
for (i = 0; i < l.batch; ++i) {
|
||||
int m = l.c;
|
||||
int n = l.size*l.size*l.n;
|
||||
int k = l.h*l.w;
|
||||
int n = l.size * l.size * l.n;
|
||||
int k = l.h * l.w;
|
||||
|
||||
float *a = net.input_gpu + i*m*k;
|
||||
float *a = net.input_gpu + i * m * k;
|
||||
float *b = net.workspace;
|
||||
float *c = l.weight_updates_gpu;
|
||||
|
||||
im2col_gpu(l.delta_gpu + i*l.outputs, l.out_c, l.out_h, l.out_w,
|
||||
l.size, l.stride, l.pad, b);
|
||||
gemm_gpu(0,1,m,n,k,1,a,k,b,k,1,c,n);
|
||||
im2col_gpu(l.delta_gpu + i * l.outputs, l.out_c, l.out_h, l.out_w,
|
||||
l.size, l.stride, l.pad, b);
|
||||
gemm_gpu(0, 1, m, n, k, 1, a, k, b, k, 1, c, n);
|
||||
|
||||
if(net.delta_gpu){
|
||||
if (net.delta_gpu) {
|
||||
int m = l.c;
|
||||
int n = l.h*l.w;
|
||||
int k = l.size*l.size*l.n;
|
||||
int n = l.h * l.w;
|
||||
int k = l.size * l.size * l.n;
|
||||
|
||||
float *a = l.weights_gpu;
|
||||
float *b = net.workspace;
|
||||
float *c = net.delta_gpu + i*n*m;
|
||||
float *c = net.delta_gpu + i * n * m;
|
||||
|
||||
gemm_gpu(0,0,m,n,k,1,a,k,b,n,1,c,n);
|
||||
gemm_gpu(0, 0, m, n, k, 1, a, k, b, n, 1, c, n);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" void pull_deconvolutional_layer(layer l)
|
||||
{
|
||||
cuda_pull_array(l.weights_gpu, l.weights, l.c*l.n*l.size*l.size);
|
||||
void pull_deconvolutional_layer(layer l) {
|
||||
cuda_pull_array(l.weights_gpu, l.weights, l.c * l.n * l.size * l.size);
|
||||
cuda_pull_array(l.biases_gpu, l.biases, l.n);
|
||||
cuda_pull_array(l.weight_updates_gpu, l.weight_updates, l.c*l.n*l.size*l.size);
|
||||
cuda_pull_array(l.weight_updates_gpu, l.weight_updates, l.c * l.n * l.size * l.size);
|
||||
cuda_pull_array(l.bias_updates_gpu, l.bias_updates, l.n);
|
||||
if (l.batch_normalize){
|
||||
if (l.batch_normalize) {
|
||||
cuda_pull_array(l.scales_gpu, l.scales, l.n);
|
||||
cuda_pull_array(l.rolling_mean_gpu, l.rolling_mean, l.n);
|
||||
cuda_pull_array(l.rolling_variance_gpu, l.rolling_variance, l.n);
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" void push_deconvolutional_layer(layer l)
|
||||
{
|
||||
cuda_push_array(l.weights_gpu, l.weights, l.c*l.n*l.size*l.size);
|
||||
void push_deconvolutional_layer(layer l) {
|
||||
cuda_push_array(l.weights_gpu, l.weights, l.c * l.n * l.size * l.size);
|
||||
cuda_push_array(l.biases_gpu, l.biases, l.n);
|
||||
cuda_push_array(l.weight_updates_gpu, l.weight_updates, l.c*l.n*l.size*l.size);
|
||||
cuda_push_array(l.weight_updates_gpu, l.weight_updates, l.c * l.n * l.size * l.size);
|
||||
cuda_push_array(l.bias_updates_gpu, l.bias_updates, l.n);
|
||||
if (l.batch_normalize){
|
||||
if (l.batch_normalize) {
|
||||
cuda_push_array(l.scales_gpu, l.scales, l.n);
|
||||
cuda_push_array(l.rolling_mean_gpu, l.rolling_mean, l.n);
|
||||
cuda_push_array(l.rolling_variance_gpu, l.rolling_variance, l.n);
|
||||
}
|
||||
}
|
||||
|
||||
void update_deconvolutional_layer_gpu(layer l, update_args a)
|
||||
{
|
||||
float learning_rate = a.learning_rate*l.learning_rate_scale;
|
||||
void update_deconvolutional_layer_gpu(layer l, update_args a) {
|
||||
float learning_rate = a.learning_rate * l.learning_rate_scale;
|
||||
float momentum = a.momentum;
|
||||
float decay = a.decay;
|
||||
int batch = a.batch;
|
||||
|
||||
if(a.adam){
|
||||
adam_update_gpu(l.weights_gpu, l.weight_updates_gpu, l.m_gpu, l.v_gpu, a.B1, a.B2, a.eps, decay, learning_rate, l.nweights, batch, a.t);
|
||||
adam_update_gpu(l.biases_gpu, l.bias_updates_gpu, l.bias_m_gpu, l.bias_v_gpu, a.B1, a.B2, a.eps, decay, learning_rate, l.n, batch, a.t);
|
||||
if(l.scales_gpu){
|
||||
adam_update_gpu(l.scales_gpu, l.scale_updates_gpu, l.scale_m_gpu, l.scale_v_gpu, a.B1, a.B2, a.eps, decay, learning_rate, l.n, batch, a.t);
|
||||
if (a.adam) {
|
||||
adam_update_gpu(l.weights_gpu, l.weight_updates_gpu, l.m_gpu, l.v_gpu, a.B1, a.B2, a.eps, decay, learning_rate,
|
||||
l.nweights, batch, a.t);
|
||||
adam_update_gpu(l.biases_gpu, l.bias_updates_gpu, l.bias_m_gpu, l.bias_v_gpu, a.B1, a.B2, a.eps, decay,
|
||||
learning_rate, l.n, batch, a.t);
|
||||
if (l.scales_gpu) {
|
||||
adam_update_gpu(l.scales_gpu, l.scale_updates_gpu, l.scale_m_gpu, l.scale_v_gpu, a.B1, a.B2, a.eps, decay,
|
||||
learning_rate, l.n, batch, a.t);
|
||||
}
|
||||
}else{
|
||||
axpy_gpu(l.nweights, -decay*batch, l.weights_gpu, 1, l.weight_updates_gpu, 1);
|
||||
axpy_gpu(l.nweights, learning_rate/batch, l.weight_updates_gpu, 1, l.weights_gpu, 1);
|
||||
} else {
|
||||
axpy_gpu(l.nweights, -decay * batch, l.weights_gpu, 1, l.weight_updates_gpu, 1);
|
||||
axpy_gpu(l.nweights, learning_rate / batch, l.weight_updates_gpu, 1, l.weights_gpu, 1);
|
||||
scal_gpu(l.nweights, momentum, l.weight_updates_gpu, 1);
|
||||
|
||||
axpy_gpu(l.n, learning_rate/batch, l.bias_updates_gpu, 1, l.biases_gpu, 1);
|
||||
axpy_gpu(l.n, learning_rate / batch, l.bias_updates_gpu, 1, l.biases_gpu, 1);
|
||||
scal_gpu(l.n, momentum, l.bias_updates_gpu, 1);
|
||||
|
||||
if(l.scales_gpu){
|
||||
axpy_gpu(l.n, learning_rate/batch, l.scale_updates_gpu, 1, l.scales_gpu, 1);
|
||||
if (l.scales_gpu) {
|
||||
axpy_gpu(l.n, learning_rate / batch, l.scale_updates_gpu, 1, l.scales_gpu, 1);
|
||||
scal_gpu(l.n, momentum, l.scale_updates_gpu, 1);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,312 +0,0 @@
|
|||
#include "deconvolutional_layer.h"
|
||||
#include "convolutional_layer.h"
|
||||
#include "batchnorm_layer.h"
|
||||
#include "utils.h"
|
||||
#include "im2col.h"
|
||||
#include "col2im.h"
|
||||
#include "blas.h"
|
||||
#include "gemm.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <time.h>
|
||||
|
||||
|
||||
static size_t get_workspace_size(layer l){
|
||||
return (size_t)l.h*l.w*l.size*l.size*l.n*sizeof(float);
|
||||
}
|
||||
|
||||
void bilinear_init(layer l)
|
||||
{
|
||||
int i,j,f;
|
||||
float center = (l.size-1) / 2.;
|
||||
for(f = 0; f < l.n; ++f){
|
||||
for(j = 0; j < l.size; ++j){
|
||||
for(i = 0; i < l.size; ++i){
|
||||
float val = (1 - fabs(i - center)) * (1 - fabs(j - center));
|
||||
int c = f%l.c;
|
||||
int ind = f*l.size*l.size*l.c + c*l.size*l.size + j*l.size + i;
|
||||
l.weights[ind] = val;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
layer make_deconvolutional_layer(int batch, int h, int w, int c, int n, int size, int stride, int padding, ACTIVATION activation, int batch_normalize, int adam)
|
||||
{
|
||||
int i;
|
||||
layer l = {0};
|
||||
l.type = DECONVOLUTIONAL;
|
||||
|
||||
l.h = h;
|
||||
l.w = w;
|
||||
l.c = c;
|
||||
l.n = n;
|
||||
l.batch = batch;
|
||||
l.stride = stride;
|
||||
l.size = size;
|
||||
|
||||
l.nweights = c*n*size*size;
|
||||
l.nbiases = n;
|
||||
|
||||
l.weights = calloc(c*n*size*size, sizeof(float));
|
||||
l.weight_updates = calloc(c*n*size*size, sizeof(float));
|
||||
|
||||
l.biases = calloc(n, sizeof(float));
|
||||
l.bias_updates = calloc(n, sizeof(float));
|
||||
//float scale = n/(size*size*c);
|
||||
//printf("scale: %f\n", scale);
|
||||
float scale = .02;
|
||||
for(i = 0; i < c*n*size*size; ++i) l.weights[i] = scale*rand_normal();
|
||||
//bilinear_init(l);
|
||||
for(i = 0; i < n; ++i){
|
||||
l.biases[i] = 0;
|
||||
}
|
||||
l.pad = padding;
|
||||
|
||||
l.out_h = (l.h - 1) * l.stride + l.size - 2*l.pad;
|
||||
l.out_w = (l.w - 1) * l.stride + l.size - 2*l.pad;
|
||||
l.out_c = n;
|
||||
l.outputs = l.out_w * l.out_h * l.out_c;
|
||||
l.inputs = l.w * l.h * l.c;
|
||||
|
||||
scal_cpu(l.nweights, (float)l.out_w*l.out_h/(l.w*l.h), l.weights, 1);
|
||||
|
||||
l.output = calloc(l.batch*l.outputs, sizeof(float));
|
||||
l.delta = calloc(l.batch*l.outputs, sizeof(float));
|
||||
|
||||
l.forward = forward_deconvolutional_layer;
|
||||
l.backward = backward_deconvolutional_layer;
|
||||
l.update = update_deconvolutional_layer;
|
||||
|
||||
l.batch_normalize = batch_normalize;
|
||||
|
||||
if(batch_normalize){
|
||||
l.scales = calloc(n, sizeof(float));
|
||||
l.scale_updates = calloc(n, sizeof(float));
|
||||
for(i = 0; i < n; ++i){
|
||||
l.scales[i] = 1;
|
||||
}
|
||||
|
||||
l.mean = calloc(n, sizeof(float));
|
||||
l.variance = calloc(n, sizeof(float));
|
||||
|
||||
l.mean_delta = calloc(n, sizeof(float));
|
||||
l.variance_delta = calloc(n, sizeof(float));
|
||||
|
||||
l.rolling_mean = calloc(n, sizeof(float));
|
||||
l.rolling_variance = calloc(n, sizeof(float));
|
||||
l.x = calloc(l.batch*l.outputs, sizeof(float));
|
||||
l.x_norm = calloc(l.batch*l.outputs, sizeof(float));
|
||||
}
|
||||
if(adam){
|
||||
l.m = calloc(c*n*size*size, sizeof(float));
|
||||
l.v = calloc(c*n*size*size, sizeof(float));
|
||||
l.bias_m = calloc(n, sizeof(float));
|
||||
l.scale_m = calloc(n, sizeof(float));
|
||||
l.bias_v = calloc(n, sizeof(float));
|
||||
l.scale_v = calloc(n, sizeof(float));
|
||||
}
|
||||
|
||||
#ifdef GPU
|
||||
l.forward_gpu = forward_deconvolutional_layer_gpu;
|
||||
l.backward_gpu = backward_deconvolutional_layer_gpu;
|
||||
l.update_gpu = update_deconvolutional_layer_gpu;
|
||||
|
||||
if(gpu_index >= 0){
|
||||
|
||||
if (adam) {
|
||||
l.m_gpu = cuda_make_array(l.m, c*n*size*size);
|
||||
l.v_gpu = cuda_make_array(l.v, c*n*size*size);
|
||||
l.bias_m_gpu = cuda_make_array(l.bias_m, n);
|
||||
l.bias_v_gpu = cuda_make_array(l.bias_v, n);
|
||||
l.scale_m_gpu = cuda_make_array(l.scale_m, n);
|
||||
l.scale_v_gpu = cuda_make_array(l.scale_v, n);
|
||||
}
|
||||
l.weights_gpu = cuda_make_array(l.weights, c*n*size*size);
|
||||
l.weight_updates_gpu = cuda_make_array(l.weight_updates, c*n*size*size);
|
||||
|
||||
l.biases_gpu = cuda_make_array(l.biases, n);
|
||||
l.bias_updates_gpu = cuda_make_array(l.bias_updates, n);
|
||||
|
||||
l.delta_gpu = cuda_make_array(l.delta, l.batch*l.out_h*l.out_w*n);
|
||||
l.output_gpu = cuda_make_array(l.output, l.batch*l.out_h*l.out_w*n);
|
||||
|
||||
if(batch_normalize){
|
||||
l.mean_gpu = cuda_make_array(0, n);
|
||||
l.variance_gpu = cuda_make_array(0, n);
|
||||
|
||||
l.rolling_mean_gpu = cuda_make_array(0, n);
|
||||
l.rolling_variance_gpu = cuda_make_array(0, n);
|
||||
|
||||
l.mean_delta_gpu = cuda_make_array(0, n);
|
||||
l.variance_delta_gpu = cuda_make_array(0, n);
|
||||
|
||||
l.scales_gpu = cuda_make_array(l.scales, n);
|
||||
l.scale_updates_gpu = cuda_make_array(0, n);
|
||||
|
||||
l.x_gpu = cuda_make_array(0, l.batch*l.out_h*l.out_w*n);
|
||||
l.x_norm_gpu = cuda_make_array(0, l.batch*l.out_h*l.out_w*n);
|
||||
}
|
||||
}
|
||||
#ifdef CUDNN
|
||||
cudnnCreateTensorDescriptor(&l.dstTensorDesc);
|
||||
cudnnCreateTensorDescriptor(&l.normTensorDesc);
|
||||
cudnnSetTensor4dDescriptor(l.dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l.batch, l.out_c, l.out_h, l.out_w);
|
||||
cudnnSetTensor4dDescriptor(l.normTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, l.out_c, 1, 1);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
l.activation = activation;
|
||||
l.workspace_size = get_workspace_size(l);
|
||||
|
||||
fprintf(stderr, "deconv%5d %2d x%2d /%2d %4d x%4d x%4d -> %4d x%4d x%4d\n", n, size, size, stride, w, h, c, l.out_w, l.out_h, l.out_c);
|
||||
|
||||
return l;
|
||||
}
|
||||
|
||||
void denormalize_deconvolutional_layer(layer l)
|
||||
{
|
||||
int i, j;
|
||||
for(i = 0; i < l.n; ++i){
|
||||
float scale = l.scales[i]/sqrt(l.rolling_variance[i] + .00001);
|
||||
for(j = 0; j < l.c*l.size*l.size; ++j){
|
||||
l.weights[i*l.c*l.size*l.size + j] *= scale;
|
||||
}
|
||||
l.biases[i] -= l.rolling_mean[i] * scale;
|
||||
l.scales[i] = 1;
|
||||
l.rolling_mean[i] = 0;
|
||||
l.rolling_variance[i] = 1;
|
||||
}
|
||||
}
|
||||
|
||||
void resize_deconvolutional_layer(layer *l, int h, int w)
|
||||
{
|
||||
l->h = h;
|
||||
l->w = w;
|
||||
l->out_h = (l->h - 1) * l->stride + l->size - 2*l->pad;
|
||||
l->out_w = (l->w - 1) * l->stride + l->size - 2*l->pad;
|
||||
|
||||
l->outputs = l->out_h * l->out_w * l->out_c;
|
||||
l->inputs = l->w * l->h * l->c;
|
||||
|
||||
l->output = realloc(l->output, l->batch*l->outputs*sizeof(float));
|
||||
l->delta = realloc(l->delta, l->batch*l->outputs*sizeof(float));
|
||||
if(l->batch_normalize){
|
||||
l->x = realloc(l->x, l->batch*l->outputs*sizeof(float));
|
||||
l->x_norm = realloc(l->x_norm, l->batch*l->outputs*sizeof(float));
|
||||
}
|
||||
|
||||
#ifdef GPU
|
||||
cuda_free(l->delta_gpu);
|
||||
cuda_free(l->output_gpu);
|
||||
|
||||
l->delta_gpu = cuda_make_array(l->delta, l->batch*l->outputs);
|
||||
l->output_gpu = cuda_make_array(l->output, l->batch*l->outputs);
|
||||
|
||||
if(l->batch_normalize){
|
||||
cuda_free(l->x_gpu);
|
||||
cuda_free(l->x_norm_gpu);
|
||||
|
||||
l->x_gpu = cuda_make_array(l->output, l->batch*l->outputs);
|
||||
l->x_norm_gpu = cuda_make_array(l->output, l->batch*l->outputs);
|
||||
}
|
||||
#ifdef CUDNN
|
||||
cudnnSetTensor4dDescriptor(l->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->out_c, l->out_h, l->out_w);
|
||||
cudnnSetTensor4dDescriptor(l->normTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, l->out_c, 1, 1);
|
||||
#endif
|
||||
#endif
|
||||
l->workspace_size = get_workspace_size(*l);
|
||||
}
|
||||
|
||||
void forward_deconvolutional_layer(const layer l, network net)
|
||||
{
|
||||
int i;
|
||||
|
||||
int m = l.size*l.size*l.n;
|
||||
int n = l.h*l.w;
|
||||
int k = l.c;
|
||||
|
||||
fill_cpu(l.outputs*l.batch, 0, l.output, 1);
|
||||
|
||||
for(i = 0; i < l.batch; ++i){
|
||||
float *a = l.weights;
|
||||
float *b = net.input + i*l.c*l.h*l.w;
|
||||
float *c = net.workspace;
|
||||
|
||||
gemm_cpu(1,0,m,n,k,1,a,m,b,n,0,c,n);
|
||||
|
||||
col2im_cpu(net.workspace, l.out_c, l.out_h, l.out_w, l.size, l.stride, l.pad, l.output+i*l.outputs);
|
||||
}
|
||||
if (l.batch_normalize) {
|
||||
forward_batchnorm_layer(l, net);
|
||||
} else {
|
||||
add_bias(l.output, l.biases, l.batch, l.n, l.out_w*l.out_h);
|
||||
}
|
||||
activate_array(l.output, l.batch*l.n*l.out_w*l.out_h, l.activation);
|
||||
}
|
||||
|
||||
void backward_deconvolutional_layer(layer l, network net)
|
||||
{
|
||||
int i;
|
||||
|
||||
gradient_array(l.output, l.outputs*l.batch, l.activation, l.delta);
|
||||
|
||||
if(l.batch_normalize){
|
||||
backward_batchnorm_layer(l, net);
|
||||
} else {
|
||||
backward_bias(l.bias_updates, l.delta, l.batch, l.n, l.out_w*l.out_h);
|
||||
}
|
||||
|
||||
//if(net.delta) memset(net.delta, 0, l.batch*l.h*l.w*l.c*sizeof(float));
|
||||
|
||||
for(i = 0; i < l.batch; ++i){
|
||||
int m = l.c;
|
||||
int n = l.size*l.size*l.n;
|
||||
int k = l.h*l.w;
|
||||
|
||||
float *a = net.input + i*m*k;
|
||||
float *b = net.workspace;
|
||||
float *c = l.weight_updates;
|
||||
|
||||
im2col_cpu(l.delta + i*l.outputs, l.out_c, l.out_h, l.out_w,
|
||||
l.size, l.stride, l.pad, b);
|
||||
gemm_cpu(0,1,m,n,k,1,a,k,b,k,1,c,n);
|
||||
|
||||
if(net.delta){
|
||||
int m = l.c;
|
||||
int n = l.h*l.w;
|
||||
int k = l.size*l.size*l.n;
|
||||
|
||||
float *a = l.weights;
|
||||
float *b = net.workspace;
|
||||
float *c = net.delta + i*n*m;
|
||||
|
||||
gemm_cpu(0,0,m,n,k,1,a,k,b,n,1,c,n);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void update_deconvolutional_layer(layer l, update_args a)
|
||||
{
|
||||
float learning_rate = a.learning_rate*l.learning_rate_scale;
|
||||
float momentum = a.momentum;
|
||||
float decay = a.decay;
|
||||
int batch = a.batch;
|
||||
|
||||
int size = l.size*l.size*l.c*l.n;
|
||||
axpy_cpu(l.n, learning_rate/batch, l.bias_updates, 1, l.biases, 1);
|
||||
scal_cpu(l.n, momentum, l.bias_updates, 1);
|
||||
|
||||
if(l.scales){
|
||||
axpy_cpu(l.n, learning_rate/batch, l.scale_updates, 1, l.scales, 1);
|
||||
scal_cpu(l.n, momentum, l.scale_updates, 1);
|
||||
}
|
||||
|
||||
axpy_cpu(size, -decay*batch, l.weights, 1, l.weight_updates, 1);
|
||||
axpy_cpu(size, learning_rate/batch, l.weight_updates, 1, l.weights, 1);
|
||||
scal_cpu(size, momentum, l.weight_updates, 1);
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,307 @@
|
|||
#include "deconvolutional_layer.h"
|
||||
#include "convolutional_layer.h"
|
||||
#include "batchnorm_layer.h"
|
||||
#include "utils.h"
|
||||
#include "im2col.h"
|
||||
#include "col2im.h"
|
||||
#include "blas.h"
|
||||
#include "gemm.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <time.h>
|
||||
|
||||
|
||||
static size_t get_workspace_size(layer l) {
|
||||
return (size_t) l.h * l.w * l.size * l.size * l.n * sizeof(float);
|
||||
}
|
||||
|
||||
void bilinear_init(layer l) {
|
||||
int i, j, f;
|
||||
float center = (l.size - 1) / 2.;
|
||||
for (f = 0; f < l.n; ++f) {
|
||||
for (j = 0; j < l.size; ++j) {
|
||||
for (i = 0; i < l.size; ++i) {
|
||||
float val = (1 - fabs(i - center)) * (1 - fabs(j - center));
|
||||
int c = f % l.c;
|
||||
int ind = f * l.size * l.size * l.c + c * l.size * l.size + j * l.size + i;
|
||||
l.weights[ind] = val;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
layer make_deconvolutional_layer(int batch, int h, int w, int c, int n, int size, int stride, int padding,
|
||||
ACTIVATION activation, int batch_normalize, int adam) {
|
||||
int i;
|
||||
layer l = {(LAYER_TYPE) 0};
|
||||
l.type = DECONVOLUTIONAL;
|
||||
|
||||
l.h = h;
|
||||
l.w = w;
|
||||
l.c = c;
|
||||
l.n = n;
|
||||
l.batch = batch;
|
||||
l.stride = stride;
|
||||
l.size = size;
|
||||
|
||||
l.nweights = c * n * size * size;
|
||||
l.nbiases = n;
|
||||
|
||||
l.weights = (float *) calloc(c * n * size * size, sizeof(float));
|
||||
l.weight_updates = (float *) calloc(c * n * size * size, sizeof(float));
|
||||
|
||||
l.biases = (float *) calloc(n, sizeof(float));
|
||||
l.bias_updates = (float *) calloc(n, sizeof(float));
|
||||
//float scale = n/(size*size*c);
|
||||
//printf("scale: %f\n", scale);
|
||||
float scale = .02;
|
||||
for (i = 0; i < c * n * size * size; ++i) l.weights[i] = scale * rand_normal();
|
||||
//bilinear_init(l);
|
||||
for (i = 0; i < n; ++i) {
|
||||
l.biases[i] = 0;
|
||||
}
|
||||
l.pad = padding;
|
||||
|
||||
l.out_h = (l.h - 1) * l.stride + l.size - 2 * l.pad;
|
||||
l.out_w = (l.w - 1) * l.stride + l.size - 2 * l.pad;
|
||||
l.out_c = n;
|
||||
l.outputs = l.out_w * l.out_h * l.out_c;
|
||||
l.inputs = l.w * l.h * l.c;
|
||||
|
||||
scal_cpu(l.nweights, (float) l.out_w * l.out_h / (l.w * l.h), l.weights, 1);
|
||||
|
||||
l.output = (float *) calloc(l.batch * l.outputs, sizeof(float));
|
||||
l.delta = (float *) calloc(l.batch * l.outputs, sizeof(float));
|
||||
|
||||
l.forward = forward_deconvolutional_layer;
|
||||
l.backward = backward_deconvolutional_layer;
|
||||
l.update = update_deconvolutional_layer;
|
||||
|
||||
l.batch_normalize = batch_normalize;
|
||||
|
||||
if (batch_normalize) {
|
||||
l.scales = (float *) calloc(n, sizeof(float));
|
||||
l.scale_updates = (float *) calloc(n, sizeof(float));
|
||||
for (i = 0; i < n; ++i) {
|
||||
l.scales[i] = 1;
|
||||
}
|
||||
|
||||
l.mean = (float *) calloc(n, sizeof(float));
|
||||
l.variance = (float *) calloc(n, sizeof(float));
|
||||
|
||||
l.mean_delta = (float *) calloc(n, sizeof(float));
|
||||
l.variance_delta = (float *) calloc(n, sizeof(float));
|
||||
|
||||
l.rolling_mean = (float *) calloc(n, sizeof(float));
|
||||
l.rolling_variance = (float *) calloc(n, sizeof(float));
|
||||
l.x = (float *) calloc(l.batch * l.outputs, sizeof(float));
|
||||
l.x_norm = (float *) calloc(l.batch * l.outputs, sizeof(float));
|
||||
}
|
||||
if (adam) {
|
||||
l.m = (float *) calloc(c * n * size * size, sizeof(float));
|
||||
l.v = (float *) calloc(c * n * size * size, sizeof(float));
|
||||
l.bias_m = (float *) calloc(n, sizeof(float));
|
||||
l.scale_m = (float *) calloc(n, sizeof(float));
|
||||
l.bias_v = (float *) calloc(n, sizeof(float));
|
||||
l.scale_v = (float *) calloc(n, sizeof(float));
|
||||
}
|
||||
|
||||
#ifdef GPU
|
||||
l.forward_gpu = forward_deconvolutional_layer_gpu;
|
||||
l.backward_gpu = backward_deconvolutional_layer_gpu;
|
||||
l.update_gpu = update_deconvolutional_layer_gpu;
|
||||
|
||||
if(gpu_index >= 0){
|
||||
|
||||
if (adam) {
|
||||
l.m_gpu = cuda_make_array(l.m, c*n*size*size);
|
||||
l.v_gpu = cuda_make_array(l.v, c*n*size*size);
|
||||
l.bias_m_gpu = cuda_make_array(l.bias_m, n);
|
||||
l.bias_v_gpu = cuda_make_array(l.bias_v, n);
|
||||
l.scale_m_gpu = cuda_make_array(l.scale_m, n);
|
||||
l.scale_v_gpu = cuda_make_array(l.scale_v, n);
|
||||
}
|
||||
l.weights_gpu = cuda_make_array(l.weights, c*n*size*size);
|
||||
l.weight_updates_gpu = cuda_make_array(l.weight_updates, c*n*size*size);
|
||||
|
||||
l.biases_gpu = cuda_make_array(l.biases, n);
|
||||
l.bias_updates_gpu = cuda_make_array(l.bias_updates, n);
|
||||
|
||||
l.delta_gpu = cuda_make_array(l.delta, l.batch*l.out_h*l.out_w*n);
|
||||
l.output_gpu = cuda_make_array(l.output, l.batch*l.out_h*l.out_w*n);
|
||||
|
||||
if(batch_normalize){
|
||||
l.mean_gpu = cuda_make_array(0, n);
|
||||
l.variance_gpu = cuda_make_array(0, n);
|
||||
|
||||
l.rolling_mean_gpu = cuda_make_array(0, n);
|
||||
l.rolling_variance_gpu = cuda_make_array(0, n);
|
||||
|
||||
l.mean_delta_gpu = cuda_make_array(0, n);
|
||||
l.variance_delta_gpu = cuda_make_array(0, n);
|
||||
|
||||
l.scales_gpu = cuda_make_array(l.scales, n);
|
||||
l.scale_updates_gpu = cuda_make_array(0, n);
|
||||
|
||||
l.x_gpu = cuda_make_array(0, l.batch*l.out_h*l.out_w*n);
|
||||
l.x_norm_gpu = cuda_make_array(0, l.batch*l.out_h*l.out_w*n);
|
||||
}
|
||||
}
|
||||
#ifdef CUDNN
|
||||
hipdnnCreateTensorDescriptor(&l.dstTensorDesc);
|
||||
hipdnnCreateTensorDescriptor(&l.normTensorDesc);
|
||||
hipdnnSetTensor4dDescriptor(l.dstTensorDesc, HIPDNN_TENSOR_NCHW, HIPDNN_DATA_FLOAT, l.batch, l.out_c, l.out_h, l.out_w);
|
||||
hipdnnSetTensor4dDescriptor(l.normTensorDesc, HIPDNN_TENSOR_NCHW, HIPDNN_DATA_FLOAT, 1, l.out_c, 1, 1);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
l.activation = activation;
|
||||
l.workspace_size = get_workspace_size(l);
|
||||
|
||||
fprintf(stderr, "deconv%5d %2d x%2d /%2d %4d x%4d x%4d -> %4d x%4d x%4d\n", n, size, size, stride, w, h, c,
|
||||
l.out_w, l.out_h, l.out_c);
|
||||
|
||||
return l;
|
||||
}
|
||||
|
||||
void denormalize_deconvolutional_layer(layer l) {
|
||||
int i, j;
|
||||
for (i = 0; i < l.n; ++i) {
|
||||
float scale = l.scales[i] / sqrt(l.rolling_variance[i] + .00001);
|
||||
for (j = 0; j < l.c * l.size * l.size; ++j) {
|
||||
l.weights[i * l.c * l.size * l.size + j] *= scale;
|
||||
}
|
||||
l.biases[i] -= l.rolling_mean[i] * scale;
|
||||
l.scales[i] = 1;
|
||||
l.rolling_mean[i] = 0;
|
||||
l.rolling_variance[i] = 1;
|
||||
}
|
||||
}
|
||||
|
||||
void resize_deconvolutional_layer(layer *l, int h, int w) {
|
||||
l->h = h;
|
||||
l->w = w;
|
||||
l->out_h = (l->h - 1) * l->stride + l->size - 2 * l->pad;
|
||||
l->out_w = (l->w - 1) * l->stride + l->size - 2 * l->pad;
|
||||
|
||||
l->outputs = l->out_h * l->out_w * l->out_c;
|
||||
l->inputs = l->w * l->h * l->c;
|
||||
|
||||
l->output = (float *) realloc(l->output, l->batch * l->outputs * sizeof(float));
|
||||
l->delta = (float *) realloc(l->delta, l->batch * l->outputs * sizeof(float));
|
||||
if (l->batch_normalize) {
|
||||
l->x = (float *) realloc(l->x, l->batch * l->outputs * sizeof(float));
|
||||
l->x_norm = (float *) realloc(l->x_norm, l->batch * l->outputs * sizeof(float));
|
||||
}
|
||||
|
||||
#ifdef GPU
|
||||
cuda_free(l->delta_gpu);
|
||||
cuda_free(l->output_gpu);
|
||||
|
||||
l->delta_gpu = cuda_make_array(l->delta, l->batch*l->outputs);
|
||||
l->output_gpu = cuda_make_array(l->output, l->batch*l->outputs);
|
||||
|
||||
if(l->batch_normalize){
|
||||
cuda_free(l->x_gpu);
|
||||
cuda_free(l->x_norm_gpu);
|
||||
|
||||
l->x_gpu = cuda_make_array(l->output, l->batch*l->outputs);
|
||||
l->x_norm_gpu = cuda_make_array(l->output, l->batch*l->outputs);
|
||||
}
|
||||
#ifdef CUDNN
|
||||
hipdnnSetTensor4dDescriptor(l->dstTensorDesc, HIPDNN_TENSOR_NCHW, HIPDNN_DATA_FLOAT, l->batch, l->out_c, l->out_h, l->out_w);
|
||||
hipdnnSetTensor4dDescriptor(l->normTensorDesc, HIPDNN_TENSOR_NCHW, HIPDNN_DATA_FLOAT, 1, l->out_c, 1, 1);
|
||||
#endif
|
||||
#endif
|
||||
l->workspace_size = get_workspace_size(*l);
|
||||
}
|
||||
|
||||
void forward_deconvolutional_layer(const layer l, network net) {
|
||||
int i;
|
||||
|
||||
int m = l.size * l.size * l.n;
|
||||
int n = l.h * l.w;
|
||||
int k = l.c;
|
||||
|
||||
fill_cpu(l.outputs * l.batch, 0, l.output, 1);
|
||||
|
||||
for (i = 0; i < l.batch; ++i) {
|
||||
float *a = l.weights;
|
||||
float *b = net.input + i * l.c * l.h * l.w;
|
||||
float *c = net.workspace;
|
||||
|
||||
gemm_cpu(1, 0, m, n, k, 1, a, m, b, n, 0, c, n);
|
||||
|
||||
col2im_cpu(net.workspace, l.out_c, l.out_h, l.out_w, l.size, l.stride, l.pad, l.output + i * l.outputs);
|
||||
}
|
||||
if (l.batch_normalize) {
|
||||
forward_batchnorm_layer(l, net);
|
||||
} else {
|
||||
add_bias(l.output, l.biases, l.batch, l.n, l.out_w * l.out_h);
|
||||
}
|
||||
activate_array(l.output, l.batch * l.n * l.out_w * l.out_h, l.activation);
|
||||
}
|
||||
|
||||
void backward_deconvolutional_layer(layer l, network net) {
|
||||
int i;
|
||||
|
||||
gradient_array(l.output, l.outputs * l.batch, l.activation, l.delta);
|
||||
|
||||
if (l.batch_normalize) {
|
||||
backward_batchnorm_layer(l, net);
|
||||
} else {
|
||||
backward_bias(l.bias_updates, l.delta, l.batch, l.n, l.out_w * l.out_h);
|
||||
}
|
||||
|
||||
//if(net.delta) memset(net.delta, 0, l.batch*l.h*l.w*l.c*sizeof(float));
|
||||
|
||||
for (i = 0; i < l.batch; ++i) {
|
||||
int m = l.c;
|
||||
int n = l.size * l.size * l.n;
|
||||
int k = l.h * l.w;
|
||||
|
||||
float *a = net.input + i * m * k;
|
||||
float *b = net.workspace;
|
||||
float *c = l.weight_updates;
|
||||
|
||||
im2col_cpu(l.delta + i * l.outputs, l.out_c, l.out_h, l.out_w,
|
||||
l.size, l.stride, l.pad, b);
|
||||
gemm_cpu(0, 1, m, n, k, 1, a, k, b, k, 1, c, n);
|
||||
|
||||
if (net.delta) {
|
||||
int m = l.c;
|
||||
int n = l.h * l.w;
|
||||
int k = l.size * l.size * l.n;
|
||||
|
||||
float *a = l.weights;
|
||||
float *b = net.workspace;
|
||||
float *c = net.delta + i * n * m;
|
||||
|
||||
gemm_cpu(0, 0, m, n, k, 1, a, k, b, n, 1, c, n);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void update_deconvolutional_layer(layer l, update_args a) {
|
||||
float learning_rate = a.learning_rate * l.learning_rate_scale;
|
||||
float momentum = a.momentum;
|
||||
float decay = a.decay;
|
||||
int batch = a.batch;
|
||||
|
||||
int size = l.size * l.size * l.c * l.n;
|
||||
axpy_cpu(l.n, learning_rate / batch, l.bias_updates, 1, l.biases, 1);
|
||||
scal_cpu(l.n, momentum, l.bias_updates, 1);
|
||||
|
||||
if (l.scales) {
|
||||
axpy_cpu(l.n, learning_rate / batch, l.scale_updates, 1, l.scales, 1);
|
||||
scal_cpu(l.n, momentum, l.scale_updates, 1);
|
||||
}
|
||||
|
||||
axpy_cpu(size, -decay * batch, l.weights, 1, l.weight_updates, 1);
|
||||
axpy_cpu(size, learning_rate / batch, l.weight_updates, 1, l.weights, 1);
|
||||
scal_cpu(size, momentum, l.weight_updates, 1);
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -1,13 +1,14 @@
|
|||
#ifndef DECONVOLUTIONAL_LAYER_H
|
||||
#define DECONVOLUTIONAL_LAYER_H
|
||||
|
||||
#include "cuda.h"
|
||||
#include "image.h"
|
||||
#include "activations.h"
|
||||
#include "layer.h"
|
||||
#include "network.h"
|
||||
|
||||
#ifdef GPU
|
||||
|
||||
#include "hip/hip_runtime.h"
|
||||
void forward_deconvolutional_layer_gpu(layer l, network net);
|
||||
void backward_deconvolutional_layer_gpu(layer l, network net);
|
||||
void update_deconvolutional_layer_gpu(layer l, update_args a);
|
||||
|
@ -15,10 +16,15 @@ void push_deconvolutional_layer(layer l);
|
|||
void pull_deconvolutional_layer(layer l);
|
||||
#endif
|
||||
|
||||
layer make_deconvolutional_layer(int batch, int h, int w, int c, int n, int size, int stride, int padding, ACTIVATION activation, int batch_normalize, int adam);
|
||||
layer make_deconvolutional_layer(int batch, int h, int w, int c, int n, int size, int stride, int padding,
|
||||
ACTIVATION activation, int batch_normalize, int adam);
|
||||
|
||||
void resize_deconvolutional_layer(layer *l, int h, int w);
|
||||
|
||||
void forward_deconvolutional_layer(const layer l, network net);
|
||||
|
||||
void update_deconvolutional_layer(layer l, update_args a);
|
||||
|
||||
void backward_deconvolutional_layer(layer l, network net);
|
||||
|
||||
#endif
|
||||
|
|
|
@ -203,11 +203,11 @@ void demo(char *cfgfile, char *weightfile, float thresh, int cam_index, const ch
|
|||
|
||||
int i;
|
||||
demo_total = size_network(net);
|
||||
predictions = calloc(demo_frame, sizeof(float*));
|
||||
predictions = (float **) calloc(demo_frame, sizeof(float*));
|
||||
for (i = 0; i < demo_frame; ++i){
|
||||
predictions[i] = calloc(demo_total, sizeof(float));
|
||||
predictions[i] = (float *) calloc(demo_total, sizeof(float));
|
||||
}
|
||||
avg = calloc(demo_total, sizeof(float));
|
||||
avg = (float *) calloc(demo_total, sizeof(float));
|
||||
|
||||
if(filename){
|
||||
printf("video file: %s\n", filename);
|
||||
|
@ -255,7 +255,7 @@ void demo(char *cfgfile, char *weightfile, float thresh, int cam_index, const ch
|
|||
void demo_compare(char *cfg1, char *weight1, char *cfg2, char *weight2, float thresh, int cam_index, const char *filename, char **names, int classes, int delay, char *prefix, int avg_frames, float hier, int w, int h, int frames, int fullscreen)
|
||||
{
|
||||
demo_frame = avg_frames;
|
||||
predictions = calloc(demo_frame, sizeof(float*));
|
||||
predictions = (float**)calloc(demo_frame, sizeof(float*));
|
||||
image **alphabet = load_alphabet();
|
||||
demo_names = names;
|
||||
demo_alphabet = alphabet;
|
||||
|
@ -341,9 +341,11 @@ pthread_join(detect_thread, 0);
|
|||
}
|
||||
*/
|
||||
#else
|
||||
void demo(char *cfgfile, char *weightfile, float thresh, int cam_index, const char *filename, char **names, int classes, int delay, char *prefix, int avg, float hier, int w, int h, int frames, int fullscreen)
|
||||
{
|
||||
|
||||
void demo(char *cfgfile, char *weightfile, float thresh, int cam_index, const char *filename, char **names, int classes,
|
||||
int delay, char *prefix, int avg, float hier, int w, int h, int frames, int fullscreen) {
|
||||
fprintf(stderr, "Demo needs OpenCV for webcam images.\n");
|
||||
}
|
||||
|
||||
#endif
|
||||
|
|
@ -3,7 +3,6 @@
|
|||
#include "softmax_layer.h"
|
||||
#include "blas.h"
|
||||
#include "box.h"
|
||||
#include "cuda.h"
|
||||
#include "utils.h"
|
||||
|
||||
#include <stdio.h>
|
||||
|
@ -11,9 +10,12 @@
|
|||
#include <string.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
detection_layer make_detection_layer(int batch, int inputs, int n, int side, int classes, int coords, int rescore)
|
||||
{
|
||||
detection_layer l = {0};
|
||||
#ifdef GPU
|
||||
#include "hip/hip_runtime.h"
|
||||
#endif
|
||||
|
||||
detection_layer make_detection_layer(int batch, int inputs, int n, int side, int classes, int coords, int rescore) {
|
||||
detection_layer l = {(LAYER_TYPE)0};
|
||||
l.type = DETECTION;
|
||||
|
||||
l.n = n;
|
||||
|
@ -25,12 +27,12 @@ detection_layer make_detection_layer(int batch, int inputs, int n, int side, int
|
|||
l.side = side;
|
||||
l.w = side;
|
||||
l.h = side;
|
||||
assert(side*side*((1 + l.coords)*l.n + l.classes) == inputs);
|
||||
l.cost = calloc(1, sizeof(float));
|
||||
assert(side * side * ((1 + l.coords) * l.n + l.classes) == inputs);
|
||||
l.cost = (float *) calloc(1, sizeof(float));
|
||||
l.outputs = l.inputs;
|
||||
l.truths = l.side*l.side*(1+l.coords+l.classes);
|
||||
l.output = calloc(batch*l.outputs, sizeof(float));
|
||||
l.delta = calloc(batch*l.outputs, sizeof(float));
|
||||
l.truths = l.side * l.side * (1 + l.coords + l.classes);
|
||||
l.output = (float *) calloc(batch * l.outputs, sizeof(float));
|
||||
l.delta = (float *) calloc(batch * l.outputs, sizeof(float));
|
||||
|
||||
l.forward = forward_detection_layer;
|
||||
l.backward = backward_detection_layer;
|
||||
|
@ -47,24 +49,23 @@ detection_layer make_detection_layer(int batch, int inputs, int n, int side, int
|
|||
return l;
|
||||
}
|
||||
|
||||
void forward_detection_layer(const detection_layer l, network net)
|
||||
{
|
||||
int locations = l.side*l.side;
|
||||
int i,j;
|
||||
memcpy(l.output, net.input, l.outputs*l.batch*sizeof(float));
|
||||
void forward_detection_layer(const detection_layer l, network net) {
|
||||
int locations = l.side * l.side;
|
||||
int i, j;
|
||||
memcpy(l.output, net.input, l.outputs * l.batch * sizeof(float));
|
||||
//if(l.reorg) reorg(l.output, l.w*l.h, size*l.n, l.batch, 1);
|
||||
int b;
|
||||
if (l.softmax){
|
||||
for(b = 0; b < l.batch; ++b){
|
||||
int index = b*l.inputs;
|
||||
if (l.softmax) {
|
||||
for (b = 0; b < l.batch; ++b) {
|
||||
int index = b * l.inputs;
|
||||
for (i = 0; i < locations; ++i) {
|
||||
int offset = i*l.classes;
|
||||
int offset = i * l.classes;
|
||||
softmax(l.output + index + offset, l.classes, 1, 1,
|
||||
l.output + index + offset);
|
||||
}
|
||||
}
|
||||
}
|
||||
if(net.train){
|
||||
if (net.train) {
|
||||
float avg_iou = 0;
|
||||
float avg_cat = 0;
|
||||
float avg_allcat = 0;
|
||||
|
@ -74,15 +75,15 @@ void forward_detection_layer(const detection_layer l, network net)
|
|||
*(l.cost) = 0;
|
||||
int size = l.inputs * l.batch;
|
||||
memset(l.delta, 0, size * sizeof(float));
|
||||
for (b = 0; b < l.batch; ++b){
|
||||
int index = b*l.inputs;
|
||||
for (b = 0; b < l.batch; ++b) {
|
||||
int index = b * l.inputs;
|
||||
for (i = 0; i < locations; ++i) {
|
||||
int truth_index = (b*locations + i)*(1+l.coords+l.classes);
|
||||
int truth_index = (b * locations + i) * (1 + l.coords + l.classes);
|
||||
int is_obj = net.truth[truth_index];
|
||||
for (j = 0; j < l.n; ++j) {
|
||||
int p_index = index + locations*l.classes + i*l.n + j;
|
||||
l.delta[p_index] = l.noobject_scale*(0 - l.output[p_index]);
|
||||
*(l.cost) += l.noobject_scale*pow(l.output[p_index], 2);
|
||||
int p_index = index + locations * l.classes + i * l.n + j;
|
||||
l.delta[p_index] = l.noobject_scale * (0 - l.output[p_index]);
|
||||
*(l.cost) += l.noobject_scale * pow(l.output[p_index], 2);
|
||||
avg_anyobj += l.output[p_index];
|
||||
}
|
||||
|
||||
|
@ -90,118 +91,121 @@ void forward_detection_layer(const detection_layer l, network net)
|
|||
float best_iou = 0;
|
||||
float best_rmse = 20;
|
||||
|
||||
if (!is_obj){
|
||||
if (!is_obj) {
|
||||
continue;
|
||||
}
|
||||
|
||||
int class_index = index + i*l.classes;
|
||||
for(j = 0; j < l.classes; ++j) {
|
||||
l.delta[class_index+j] = l.class_scale * (net.truth[truth_index+1+j] - l.output[class_index+j]);
|
||||
*(l.cost) += l.class_scale * pow(net.truth[truth_index+1+j] - l.output[class_index+j], 2);
|
||||
if(net.truth[truth_index + 1 + j]) avg_cat += l.output[class_index+j];
|
||||
avg_allcat += l.output[class_index+j];
|
||||
int class_index = index + i * l.classes;
|
||||
for (j = 0; j < l.classes; ++j) {
|
||||
l.delta[class_index + j] =
|
||||
l.class_scale * (net.truth[truth_index + 1 + j] - l.output[class_index + j]);
|
||||
*(l.cost) += l.class_scale * pow(net.truth[truth_index + 1 + j] - l.output[class_index + j], 2);
|
||||
if (net.truth[truth_index + 1 + j]) avg_cat += l.output[class_index + j];
|
||||
avg_allcat += l.output[class_index + j];
|
||||
}
|
||||
|
||||
box truth = float_to_box(net.truth + truth_index + 1 + l.classes, 1);
|
||||
truth.x /= l.side;
|
||||
truth.y /= l.side;
|
||||
|
||||
for(j = 0; j < l.n; ++j){
|
||||
int box_index = index + locations*(l.classes + l.n) + (i*l.n + j) * l.coords;
|
||||
for (j = 0; j < l.n; ++j) {
|
||||
int box_index = index + locations * (l.classes + l.n) + (i * l.n + j) * l.coords;
|
||||
box out = float_to_box(l.output + box_index, 1);
|
||||
out.x /= l.side;
|
||||
out.y /= l.side;
|
||||
|
||||
if (l.sqrt){
|
||||
out.w = out.w*out.w;
|
||||
out.h = out.h*out.h;
|
||||
if (l.sqrt) {
|
||||
out.w = out.w * out.w;
|
||||
out.h = out.h * out.h;
|
||||
}
|
||||
|
||||
float iou = box_iou(out, truth);
|
||||
float iou = box_iou(out, truth);
|
||||
//iou = 0;
|
||||
float rmse = box_rmse(out, truth);
|
||||
if(best_iou > 0 || iou > 0){
|
||||
if(iou > best_iou){
|
||||
if (best_iou > 0 || iou > 0) {
|
||||
if (iou > best_iou) {
|
||||
best_iou = iou;
|
||||
best_index = j;
|
||||
}
|
||||
}else{
|
||||
if(rmse < best_rmse){
|
||||
} else {
|
||||
if (rmse < best_rmse) {
|
||||
best_rmse = rmse;
|
||||
best_index = j;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if(l.forced){
|
||||
if(truth.w*truth.h < .1){
|
||||
if (l.forced) {
|
||||
if (truth.w * truth.h < .1) {
|
||||
best_index = 1;
|
||||
}else{
|
||||
} else {
|
||||
best_index = 0;
|
||||
}
|
||||
}
|
||||
if(l.random && *(net.seen) < 64000){
|
||||
best_index = rand()%l.n;
|
||||
if (l.random && *(net.seen) < 64000) {
|
||||
best_index = rand() % l.n;
|
||||
}
|
||||
|
||||
int box_index = index + locations*(l.classes + l.n) + (i*l.n + best_index) * l.coords;
|
||||
int box_index = index + locations * (l.classes + l.n) + (i * l.n + best_index) * l.coords;
|
||||
int tbox_index = truth_index + 1 + l.classes;
|
||||
|
||||
box out = float_to_box(l.output + box_index, 1);
|
||||
out.x /= l.side;
|
||||
out.y /= l.side;
|
||||
if (l.sqrt) {
|
||||
out.w = out.w*out.w;
|
||||
out.h = out.h*out.h;
|
||||
out.w = out.w * out.w;
|
||||
out.h = out.h * out.h;
|
||||
}
|
||||
float iou = box_iou(out, truth);
|
||||
float iou = box_iou(out, truth);
|
||||
|
||||
//printf("%d,", best_index);
|
||||
int p_index = index + locations*l.classes + i*l.n + best_index;
|
||||
int p_index = index + locations * l.classes + i * l.n + best_index;
|
||||
*(l.cost) -= l.noobject_scale * pow(l.output[p_index], 2);
|
||||
*(l.cost) += l.object_scale * pow(1-l.output[p_index], 2);
|
||||
*(l.cost) += l.object_scale * pow(1 - l.output[p_index], 2);
|
||||
avg_obj += l.output[p_index];
|
||||
l.delta[p_index] = l.object_scale * (1.-l.output[p_index]);
|
||||
l.delta[p_index] = l.object_scale * (1. - l.output[p_index]);
|
||||
|
||||
if(l.rescore){
|
||||
if (l.rescore) {
|
||||
l.delta[p_index] = l.object_scale * (iou - l.output[p_index]);
|
||||
}
|
||||
|
||||
l.delta[box_index+0] = l.coord_scale*(net.truth[tbox_index + 0] - l.output[box_index + 0]);
|
||||
l.delta[box_index+1] = l.coord_scale*(net.truth[tbox_index + 1] - l.output[box_index + 1]);
|
||||
l.delta[box_index+2] = l.coord_scale*(net.truth[tbox_index + 2] - l.output[box_index + 2]);
|
||||
l.delta[box_index+3] = l.coord_scale*(net.truth[tbox_index + 3] - l.output[box_index + 3]);
|
||||
if(l.sqrt){
|
||||
l.delta[box_index+2] = l.coord_scale*(sqrt(net.truth[tbox_index + 2]) - l.output[box_index + 2]);
|
||||
l.delta[box_index+3] = l.coord_scale*(sqrt(net.truth[tbox_index + 3]) - l.output[box_index + 3]);
|
||||
l.delta[box_index + 0] = l.coord_scale * (net.truth[tbox_index + 0] - l.output[box_index + 0]);
|
||||
l.delta[box_index + 1] = l.coord_scale * (net.truth[tbox_index + 1] - l.output[box_index + 1]);
|
||||
l.delta[box_index + 2] = l.coord_scale * (net.truth[tbox_index + 2] - l.output[box_index + 2]);
|
||||
l.delta[box_index + 3] = l.coord_scale * (net.truth[tbox_index + 3] - l.output[box_index + 3]);
|
||||
if (l.sqrt) {
|
||||
l.delta[box_index + 2] =
|
||||
l.coord_scale * (sqrt(net.truth[tbox_index + 2]) - l.output[box_index + 2]);
|
||||
l.delta[box_index + 3] =
|
||||
l.coord_scale * (sqrt(net.truth[tbox_index + 3]) - l.output[box_index + 3]);
|
||||
}
|
||||
|
||||
*(l.cost) += pow(1-iou, 2);
|
||||
*(l.cost) += pow(1 - iou, 2);
|
||||
avg_iou += iou;
|
||||
++count;
|
||||
}
|
||||
}
|
||||
|
||||
if(0){
|
||||
float *costs = calloc(l.batch*locations*l.n, sizeof(float));
|
||||
if (0) {
|
||||
float *costs = (float *) calloc(l.batch * locations * l.n, sizeof(float));
|
||||
for (b = 0; b < l.batch; ++b) {
|
||||
int index = b*l.inputs;
|
||||
int index = b * l.inputs;
|
||||
for (i = 0; i < locations; ++i) {
|
||||
for (j = 0; j < l.n; ++j) {
|
||||
int p_index = index + locations*l.classes + i*l.n + j;
|
||||
costs[b*locations*l.n + i*l.n + j] = l.delta[p_index]*l.delta[p_index];
|
||||
int p_index = index + locations * l.classes + i * l.n + j;
|
||||
costs[b * locations * l.n + i * l.n + j] = l.delta[p_index] * l.delta[p_index];
|
||||
}
|
||||
}
|
||||
}
|
||||
int indexes[100];
|
||||
top_k(costs, l.batch*locations*l.n, 100, indexes);
|
||||
top_k(costs, l.batch * locations * l.n, 100, indexes);
|
||||
float cutoff = costs[indexes[99]];
|
||||
for (b = 0; b < l.batch; ++b) {
|
||||
int index = b*l.inputs;
|
||||
int index = b * l.inputs;
|
||||
for (i = 0; i < locations; ++i) {
|
||||
for (j = 0; j < l.n; ++j) {
|
||||
int p_index = index + locations*l.classes + i*l.n + j;
|
||||
if (l.delta[p_index]*l.delta[p_index] < cutoff) l.delta[p_index] = 0;
|
||||
int p_index = index + locations * l.classes + i * l.n + j;
|
||||
if (l.delta[p_index] * l.delta[p_index] < cutoff) l.delta[p_index] = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -212,39 +216,39 @@ void forward_detection_layer(const detection_layer l, network net)
|
|||
*(l.cost) = pow(mag_array(l.delta, l.outputs * l.batch), 2);
|
||||
|
||||
|
||||
printf("Detection Avg IOU: %f, Pos Cat: %f, All Cat: %f, Pos Obj: %f, Any Obj: %f, count: %d\n", avg_iou/count, avg_cat/count, avg_allcat/(count*l.classes), avg_obj/count, avg_anyobj/(l.batch*locations*l.n), count);
|
||||
printf("Detection Avg IOU: %f, Pos Cat: %f, All Cat: %f, Pos Obj: %f, Any Obj: %f, count: %d\n",
|
||||
avg_iou / count, avg_cat / count, avg_allcat / (count * l.classes), avg_obj / count,
|
||||
avg_anyobj / (l.batch * locations * l.n), count);
|
||||
//if(l.reorg) reorg(l.delta, l.w*l.h, size*l.n, l.batch, 0);
|
||||
}
|
||||
}
|
||||
|
||||
void backward_detection_layer(const detection_layer l, network net)
|
||||
{
|
||||
axpy_cpu(l.batch*l.inputs, 1, l.delta, 1, net.delta, 1);
|
||||
void backward_detection_layer(const detection_layer l, network net) {
|
||||
axpy_cpu(l.batch * l.inputs, 1, l.delta, 1, net.delta, 1);
|
||||
}
|
||||
|
||||
void get_detection_detections(layer l, int w, int h, float thresh, detection *dets)
|
||||
{
|
||||
int i,j,n;
|
||||
void get_detection_detections(layer l, int w, int h, float thresh, detection *dets) {
|
||||
int i, j, n;
|
||||
float *predictions = l.output;
|
||||
//int per_cell = 5*num+classes;
|
||||
for (i = 0; i < l.side*l.side; ++i){
|
||||
for (i = 0; i < l.side * l.side; ++i) {
|
||||
int row = i / l.side;
|
||||
int col = i % l.side;
|
||||
for(n = 0; n < l.n; ++n){
|
||||
int index = i*l.n + n;
|
||||
int p_index = l.side*l.side*l.classes + i*l.n + n;
|
||||
for (n = 0; n < l.n; ++n) {
|
||||
int index = i * l.n + n;
|
||||
int p_index = l.side * l.side * l.classes + i * l.n + n;
|
||||
float scale = predictions[p_index];
|
||||
int box_index = l.side*l.side*(l.classes + l.n) + (i*l.n + n)*4;
|
||||
int box_index = l.side * l.side * (l.classes + l.n) + (i * l.n + n) * 4;
|
||||
box b;
|
||||
b.x = (predictions[box_index + 0] + col) / l.side * w;
|
||||
b.y = (predictions[box_index + 1] + row) / l.side * h;
|
||||
b.w = pow(predictions[box_index + 2], (l.sqrt?2:1)) * w;
|
||||
b.h = pow(predictions[box_index + 3], (l.sqrt?2:1)) * h;
|
||||
b.w = pow(predictions[box_index + 2], (l.sqrt ? 2 : 1)) * w;
|
||||
b.h = pow(predictions[box_index + 3], (l.sqrt ? 2 : 1)) * h;
|
||||
dets[index].bbox = b;
|
||||
dets[index].objectness = scale;
|
||||
for(j = 0; j < l.classes; ++j){
|
||||
int class_index = i*l.classes;
|
||||
float prob = scale*predictions[class_index+j];
|
||||
for (j = 0; j < l.classes; ++j) {
|
||||
int class_index = i * l.classes;
|
||||
float prob = scale * predictions[class_index + j];
|
||||
dets[index].prob[j] = (prob > thresh) ? prob : 0;
|
||||
}
|
||||
}
|
|
@ -7,7 +7,9 @@
|
|||
typedef layer detection_layer;
|
||||
|
||||
detection_layer make_detection_layer(int batch, int inputs, int n, int size, int classes, int coords, int rescore);
|
||||
|
||||
void forward_detection_layer(const detection_layer l, network net);
|
||||
|
||||
void backward_detection_layer(const detection_layer l, network net);
|
||||
|
||||
#ifdef GPU
|
||||
|
|
|
@ -1,59 +1,58 @@
|
|||
#include "dropout_layer.h"
|
||||
#include "utils.h"
|
||||
#include "cuda.h"
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
|
||||
dropout_layer make_dropout_layer(int batch, int inputs, float probability)
|
||||
{
|
||||
dropout_layer l = {0};
|
||||
#ifdef GPU
|
||||
#include "hip/hip_runtime.h"
|
||||
#endif
|
||||
|
||||
dropout_layer make_dropout_layer(int batch, int inputs, float probability) {
|
||||
dropout_layer l = {(LAYER_TYPE) 0};
|
||||
l.type = DROPOUT;
|
||||
l.probability = probability;
|
||||
l.inputs = inputs;
|
||||
l.outputs = inputs;
|
||||
l.batch = batch;
|
||||
l.rand = calloc(inputs*batch, sizeof(float));
|
||||
l.scale = 1./(1.-probability);
|
||||
l.rand = (float *) calloc(inputs * batch, sizeof(float));
|
||||
l.scale = 1. / (1. - probability);
|
||||
l.forward = forward_dropout_layer;
|
||||
l.backward = backward_dropout_layer;
|
||||
#ifdef GPU
|
||||
#ifdef GPU
|
||||
l.forward_gpu = forward_dropout_layer_gpu;
|
||||
l.backward_gpu = backward_dropout_layer_gpu;
|
||||
l.rand_gpu = cuda_make_array(l.rand, inputs*batch);
|
||||
#endif
|
||||
#endif
|
||||
fprintf(stderr, "dropout p = %.2f %4d -> %4d\n", probability, inputs, inputs);
|
||||
return l;
|
||||
}
|
||||
}
|
||||
|
||||
void resize_dropout_layer(dropout_layer *l, int inputs)
|
||||
{
|
||||
l->rand = realloc(l->rand, l->inputs*l->batch*sizeof(float));
|
||||
#ifdef GPU
|
||||
void resize_dropout_layer(dropout_layer *l, int inputs) {
|
||||
l->rand = (float *) realloc(l->rand, l->inputs * l->batch * sizeof(float));
|
||||
#ifdef GPU
|
||||
cuda_free(l->rand_gpu);
|
||||
|
||||
l->rand_gpu = cuda_make_array(l->rand, inputs*l->batch);
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
||||
void forward_dropout_layer(dropout_layer l, network net)
|
||||
{
|
||||
void forward_dropout_layer(dropout_layer l, network net) {
|
||||
int i;
|
||||
if (!net.train) return;
|
||||
for(i = 0; i < l.batch * l.inputs; ++i){
|
||||
for (i = 0; i < l.batch * l.inputs; ++i) {
|
||||
float r = rand_uniform(0, 1);
|
||||
l.rand[i] = r;
|
||||
if(r < l.probability) net.input[i] = 0;
|
||||
if (r < l.probability) net.input[i] = 0;
|
||||
else net.input[i] *= l.scale;
|
||||
}
|
||||
}
|
||||
|
||||
void backward_dropout_layer(dropout_layer l, network net)
|
||||
{
|
||||
void backward_dropout_layer(dropout_layer l, network net) {
|
||||
int i;
|
||||
if(!net.delta) return;
|
||||
for(i = 0; i < l.batch * l.inputs; ++i){
|
||||
if (!net.delta) return;
|
||||
for (i = 0; i < l.batch * l.inputs; ++i) {
|
||||
float r = l.rand[i];
|
||||
if(r < l.probability) net.delta[i] = 0;
|
||||
if (r < l.probability) net.delta[i] = 0;
|
||||
else net.delta[i] *= l.scale;
|
||||
}
|
||||
}
|
|
@ -9,7 +9,9 @@ typedef layer dropout_layer;
|
|||
dropout_layer make_dropout_layer(int batch, int inputs, float probability);
|
||||
|
||||
void forward_dropout_layer(dropout_layer l, network net);
|
||||
|
||||
void backward_dropout_layer(dropout_layer l, network net);
|
||||
|
||||
void resize_dropout_layer(dropout_layer *l, int inputs);
|
||||
|
||||
#ifdef GPU
|
||||
|
|
|
@ -1,23 +1,22 @@
|
|||
#include "cuda_runtime.h"
|
||||
#include "curand.h"
|
||||
#include "cublas_v2.h"
|
||||
#include "hiprand.h"
|
||||
#include "hipblas.h"
|
||||
|
||||
extern "C" {
|
||||
#include "dropout_layer.h"
|
||||
#include "cuda.h"
|
||||
#include "hip/hip_runtime.h"
|
||||
#include "utils.h"
|
||||
|
||||
#ifdef GPU
|
||||
#include "hip/hip_runtime.h"
|
||||
#endif
|
||||
|
||||
__global__ void yoloswag420blazeit360noscope(float *input, int size, float *rand, float prob, float scale) {
|
||||
int id = (blockIdx.x + blockIdx.y * gridDim.x) * blockDim.x + threadIdx.x;
|
||||
if (id < size) input[id] = (rand[id] < prob) ? 0 : input[id] * scale;
|
||||
}
|
||||
|
||||
__global__ void yoloswag420blazeit360noscope(float *input, int size, float *rand, float prob, float scale)
|
||||
{
|
||||
int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
|
||||
if(id < size) input[id] = (rand[id] < prob) ? 0 : input[id]*scale;
|
||||
}
|
||||
|
||||
void forward_dropout_layer_gpu(dropout_layer layer, network net)
|
||||
{
|
||||
void forward_dropout_layer_gpu(dropout_layer layer, network net) {
|
||||
if (!net.train) return;
|
||||
int size = layer.inputs*layer.batch;
|
||||
int size = layer.inputs * layer.batch;
|
||||
cuda_random(layer.rand_gpu, size);
|
||||
/*
|
||||
int i;
|
||||
|
@ -27,15 +26,16 @@ void forward_dropout_layer_gpu(dropout_layer layer, network net)
|
|||
cuda_push_array(layer.rand_gpu, layer.rand, size);
|
||||
*/
|
||||
|
||||
yoloswag420blazeit360noscope<<<cuda_gridsize(size), BLOCK>>>(net.input_gpu, size, layer.rand_gpu, layer.probability, layer.scale);
|
||||
check_error(cudaPeekAtLastError());
|
||||
yoloswag420blazeit360noscope<<<cuda_gridsize(
|
||||
size), BLOCK>>>(net.input_gpu, size, layer.rand_gpu, layer.probability, layer.scale);
|
||||
check_error(hipPeekAtLastError());
|
||||
}
|
||||
|
||||
void backward_dropout_layer_gpu(dropout_layer layer, network net)
|
||||
{
|
||||
if(!net.delta_gpu) return;
|
||||
int size = layer.inputs*layer.batch;
|
||||
void backward_dropout_layer_gpu(dropout_layer layer, network net) {
|
||||
if (!net.delta_gpu) return;
|
||||
int size = layer.inputs * layer.batch;
|
||||
|
||||
yoloswag420blazeit360noscope<<<cuda_gridsize(size), BLOCK>>>(net.delta_gpu, size, layer.rand_gpu, layer.probability, layer.scale);
|
||||
check_error(cudaPeekAtLastError());
|
||||
yoloswag420blazeit360noscope<<<cuda_gridsize(
|
||||
size), BLOCK>>>(net.delta_gpu, size, layer.rand_gpu, layer.probability, layer.scale);
|
||||
check_error(hipPeekAtLastError());
|
||||
}
|
||||
|
|
324
src/gemm.c
324
src/gemm.c
|
@ -1,324 +0,0 @@
|
|||
#include "gemm.h"
|
||||
#include "utils.h"
|
||||
#include "cuda.h"
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <math.h>
|
||||
|
||||
void gemm_bin(int M, int N, int K, float ALPHA,
|
||||
char *A, int lda,
|
||||
float *B, int ldb,
|
||||
float *C, int ldc)
|
||||
{
|
||||
int i,j,k;
|
||||
for(i = 0; i < M; ++i){
|
||||
for(k = 0; k < K; ++k){
|
||||
char A_PART = A[i*lda+k];
|
||||
if(A_PART){
|
||||
for(j = 0; j < N; ++j){
|
||||
C[i*ldc+j] += B[k*ldb+j];
|
||||
}
|
||||
} else {
|
||||
for(j = 0; j < N; ++j){
|
||||
C[i*ldc+j] -= B[k*ldb+j];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
float *random_matrix(int rows, int cols)
|
||||
{
|
||||
int i;
|
||||
float *m = calloc(rows*cols, sizeof(float));
|
||||
for(i = 0; i < rows*cols; ++i){
|
||||
m[i] = (float)rand()/RAND_MAX;
|
||||
}
|
||||
return m;
|
||||
}
|
||||
|
||||
void time_random_matrix(int TA, int TB, int m, int k, int n)
|
||||
{
|
||||
float *a;
|
||||
if(!TA) a = random_matrix(m,k);
|
||||
else a = random_matrix(k,m);
|
||||
int lda = (!TA)?k:m;
|
||||
float *b;
|
||||
if(!TB) b = random_matrix(k,n);
|
||||
else b = random_matrix(n,k);
|
||||
int ldb = (!TB)?n:k;
|
||||
|
||||
float *c = random_matrix(m,n);
|
||||
int i;
|
||||
clock_t start = clock(), end;
|
||||
for(i = 0; i<10; ++i){
|
||||
gemm_cpu(TA,TB,m,n,k,1,a,lda,b,ldb,1,c,n);
|
||||
}
|
||||
end = clock();
|
||||
printf("Matrix Multiplication %dx%d * %dx%d, TA=%d, TB=%d: %lf ms\n",m,k,k,n, TA, TB, (float)(end-start)/CLOCKS_PER_SEC);
|
||||
free(a);
|
||||
free(b);
|
||||
free(c);
|
||||
}
|
||||
|
||||
|
||||
void gemm(int TA, int TB, int M, int N, int K, float ALPHA,
|
||||
float *A, int lda,
|
||||
float *B, int ldb,
|
||||
float BETA,
|
||||
float *C, int ldc)
|
||||
{
|
||||
gemm_cpu( TA, TB, M, N, K, ALPHA,A,lda, B, ldb,BETA,C,ldc);
|
||||
}
|
||||
|
||||
void gemm_nn(int M, int N, int K, float ALPHA,
|
||||
float *A, int lda,
|
||||
float *B, int ldb,
|
||||
float *C, int ldc)
|
||||
{
|
||||
int i,j,k;
|
||||
#pragma omp parallel for
|
||||
for(i = 0; i < M; ++i){
|
||||
for(k = 0; k < K; ++k){
|
||||
register float A_PART = ALPHA*A[i*lda+k];
|
||||
for(j = 0; j < N; ++j){
|
||||
C[i*ldc+j] += A_PART*B[k*ldb+j];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void gemm_nt(int M, int N, int K, float ALPHA,
|
||||
float *A, int lda,
|
||||
float *B, int ldb,
|
||||
float *C, int ldc)
|
||||
{
|
||||
int i,j,k;
|
||||
#pragma omp parallel for
|
||||
for(i = 0; i < M; ++i){
|
||||
for(j = 0; j < N; ++j){
|
||||
register float sum = 0;
|
||||
for(k = 0; k < K; ++k){
|
||||
sum += ALPHA*A[i*lda+k]*B[j*ldb + k];
|
||||
}
|
||||
C[i*ldc+j] += sum;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void gemm_tn(int M, int N, int K, float ALPHA,
|
||||
float *A, int lda,
|
||||
float *B, int ldb,
|
||||
float *C, int ldc)
|
||||
{
|
||||
int i,j,k;
|
||||
#pragma omp parallel for
|
||||
for(i = 0; i < M; ++i){
|
||||
for(k = 0; k < K; ++k){
|
||||
register float A_PART = ALPHA*A[k*lda+i];
|
||||
for(j = 0; j < N; ++j){
|
||||
C[i*ldc+j] += A_PART*B[k*ldb+j];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void gemm_tt(int M, int N, int K, float ALPHA,
|
||||
float *A, int lda,
|
||||
float *B, int ldb,
|
||||
float *C, int ldc)
|
||||
{
|
||||
int i,j,k;
|
||||
#pragma omp parallel for
|
||||
for(i = 0; i < M; ++i){
|
||||
for(j = 0; j < N; ++j){
|
||||
register float sum = 0;
|
||||
for(k = 0; k < K; ++k){
|
||||
sum += ALPHA*A[i+k*lda]*B[k+j*ldb];
|
||||
}
|
||||
C[i*ldc+j] += sum;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void gemm_cpu(int TA, int TB, int M, int N, int K, float ALPHA,
|
||||
float *A, int lda,
|
||||
float *B, int ldb,
|
||||
float BETA,
|
||||
float *C, int ldc)
|
||||
{
|
||||
//printf("cpu: %d %d %d %d %d %f %d %d %f %d\n",TA, TB, M, N, K, ALPHA, lda, ldb, BETA, ldc);
|
||||
int i, j;
|
||||
for(i = 0; i < M; ++i){
|
||||
for(j = 0; j < N; ++j){
|
||||
C[i*ldc + j] *= BETA;
|
||||
}
|
||||
}
|
||||
if(!TA && !TB)
|
||||
gemm_nn(M, N, K, ALPHA,A,lda, B, ldb,C,ldc);
|
||||
else if(TA && !TB)
|
||||
gemm_tn(M, N, K, ALPHA,A,lda, B, ldb,C,ldc);
|
||||
else if(!TA && TB)
|
||||
gemm_nt(M, N, K, ALPHA,A,lda, B, ldb,C,ldc);
|
||||
else
|
||||
gemm_tt(M, N, K, ALPHA,A,lda, B, ldb,C,ldc);
|
||||
}
|
||||
|
||||
#ifdef GPU
|
||||
|
||||
#include <math.h>
|
||||
|
||||
void gemm_gpu(int TA, int TB, int M, int N, int K, float ALPHA,
|
||||
float *A_gpu, int lda,
|
||||
float *B_gpu, int ldb,
|
||||
float BETA,
|
||||
float *C_gpu, int ldc)
|
||||
{
|
||||
cublasHandle_t handle = blas_handle();
|
||||
cudaError_t status = cublasSgemm(handle, (TB ? CUBLAS_OP_T : CUBLAS_OP_N),
|
||||
(TA ? CUBLAS_OP_T : CUBLAS_OP_N), N, M, K, &ALPHA, B_gpu, ldb, A_gpu, lda, &BETA, C_gpu, ldc);
|
||||
check_error(status);
|
||||
}
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <time.h>
|
||||
|
||||
void time_gpu_random_matrix(int TA, int TB, int m, int k, int n)
|
||||
{
|
||||
float *a;
|
||||
if(!TA) a = random_matrix(m,k);
|
||||
else a = random_matrix(k,m);
|
||||
int lda = (!TA)?k:m;
|
||||
float *b;
|
||||
if(!TB) b = random_matrix(k,n);
|
||||
else b = random_matrix(n,k);
|
||||
int ldb = (!TB)?n:k;
|
||||
|
||||
float *c = random_matrix(m,n);
|
||||
int i;
|
||||
clock_t start = clock(), end;
|
||||
for(i = 0; i<32; ++i){
|
||||
gemm_gpu(TA,TB,m,n,k,1,a,lda,b,ldb,1,c,n);
|
||||
}
|
||||
end = clock();
|
||||
printf("Matrix Multiplication %dx%d * %dx%d, TA=%d, TB=%d: %lf s\n",m,k,k,n, TA, TB, (float)(end-start)/CLOCKS_PER_SEC);
|
||||
free(a);
|
||||
free(b);
|
||||
free(c);
|
||||
}
|
||||
|
||||
void time_gpu(int TA, int TB, int m, int k, int n)
|
||||
{
|
||||
int iter = 10;
|
||||
float *a = random_matrix(m,k);
|
||||
float *b = random_matrix(k,n);
|
||||
|
||||
int lda = (!TA)?k:m;
|
||||
int ldb = (!TB)?n:k;
|
||||
|
||||
float *c = random_matrix(m,n);
|
||||
|
||||
float *a_cl = cuda_make_array(a, m*k);
|
||||
float *b_cl = cuda_make_array(b, k*n);
|
||||
float *c_cl = cuda_make_array(c, m*n);
|
||||
|
||||
int i;
|
||||
clock_t start = clock(), end;
|
||||
for(i = 0; i<iter; ++i){
|
||||
gemm_gpu(TA,TB,m,n,k,1,a_cl,lda,b_cl,ldb,1,c_cl,n);
|
||||
cudaThreadSynchronize();
|
||||
}
|
||||
double flop = ((double)m)*n*(2.*k + 2.)*iter;
|
||||
double gflop = flop/pow(10., 9);
|
||||
end = clock();
|
||||
double seconds = sec(end-start);
|
||||
printf("Matrix Multiplication %dx%d * %dx%d, TA=%d, TB=%d: %lf s, %lf GFLOPS\n",m,k,k,n, TA, TB, seconds, gflop/seconds);
|
||||
cuda_free(a_cl);
|
||||
cuda_free(b_cl);
|
||||
cuda_free(c_cl);
|
||||
free(a);
|
||||
free(b);
|
||||
free(c);
|
||||
}
|
||||
|
||||
|
||||
void test_gpu_accuracy(int TA, int TB, int m, int k, int n)
|
||||
{
|
||||
srand(0);
|
||||
float *a;
|
||||
if(!TA) a = random_matrix(m,k);
|
||||
else a = random_matrix(k,m);
|
||||
int lda = (!TA)?k:m;
|
||||
float *b;
|
||||
if(!TB) b = random_matrix(k,n);
|
||||
else b = random_matrix(n,k);
|
||||
int ldb = (!TB)?n:k;
|
||||
|
||||
float *c = random_matrix(m,n);
|
||||
float *c_gpu = random_matrix(m,n);
|
||||
memset(c, 0, m*n*sizeof(float));
|
||||
memset(c_gpu, 0, m*n*sizeof(float));
|
||||
int i;
|
||||
//pm(m,k,b);
|
||||
gemm_gpu(TA,TB,m,n,k,1,a,lda,b,ldb,1,c_gpu,n);
|
||||
//printf("GPU\n");
|
||||
//pm(m, n, c_gpu);
|
||||
|
||||
gemm_cpu(TA,TB,m,n,k,1,a,lda,b,ldb,1,c,n);
|
||||
//printf("\n\nCPU\n");
|
||||
//pm(m, n, c);
|
||||
double sse = 0;
|
||||
for(i = 0; i < m*n; ++i) {
|
||||
//printf("%f %f\n", c[i], c_gpu[i]);
|
||||
sse += pow(c[i]-c_gpu[i], 2);
|
||||
}
|
||||
printf("Matrix Multiplication %dx%d * %dx%d, TA=%d, TB=%d: %g SSE\n",m,k,k,n, TA, TB, sse/(m*n));
|
||||
free(a);
|
||||
free(b);
|
||||
free(c);
|
||||
free(c_gpu);
|
||||
}
|
||||
|
||||
int test_gpu_blas()
|
||||
{
|
||||
/*
|
||||
test_gpu_accuracy(0,0,10,576,75);
|
||||
|
||||
test_gpu_accuracy(0,0,17,10,10);
|
||||
test_gpu_accuracy(1,0,17,10,10);
|
||||
test_gpu_accuracy(0,1,17,10,10);
|
||||
test_gpu_accuracy(1,1,17,10,10);
|
||||
|
||||
test_gpu_accuracy(0,0,1000,10,100);
|
||||
test_gpu_accuracy(1,0,1000,10,100);
|
||||
test_gpu_accuracy(0,1,1000,10,100);
|
||||
test_gpu_accuracy(1,1,1000,10,100);
|
||||
|
||||
test_gpu_accuracy(0,0,10,10,10);
|
||||
|
||||
time_gpu(0,0,64,2916,363);
|
||||
time_gpu(0,0,64,2916,363);
|
||||
time_gpu(0,0,64,2916,363);
|
||||
time_gpu(0,0,192,729,1600);
|
||||
time_gpu(0,0,384,196,1728);
|
||||
time_gpu(0,0,256,196,3456);
|
||||
time_gpu(0,0,256,196,2304);
|
||||
time_gpu(0,0,128,4096,12544);
|
||||
time_gpu(0,0,128,4096,4096);
|
||||
*/
|
||||
time_gpu(0,0,64,75,12544);
|
||||
time_gpu(0,0,64,75,12544);
|
||||
time_gpu(0,0,64,75,12544);
|
||||
time_gpu(0,0,64,576,12544);
|
||||
time_gpu(0,0,256,2304,784);
|
||||
time_gpu(1,1,2304,256,784);
|
||||
time_gpu(0,0,512,4608,196);
|
||||
time_gpu(1,1,4608,512,196);
|
||||
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
|
@ -0,0 +1,321 @@
|
|||
#include "gemm.h"
|
||||
#include "utils.h"
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <math.h>
|
||||
|
||||
void gemm_bin(int M, int N, int K, float ALPHA,
|
||||
char *A, int lda,
|
||||
float *B, int ldb,
|
||||
float *C, int ldc) {
|
||||
int i, j, k;
|
||||
for (i = 0; i < M; ++i) {
|
||||
for (k = 0; k < K; ++k) {
|
||||
char A_PART = A[i * lda + k];
|
||||
if (A_PART) {
|
||||
for (j = 0; j < N; ++j) {
|
||||
C[i * ldc + j] += B[k * ldb + j];
|
||||
}
|
||||
} else {
|
||||
for (j = 0; j < N; ++j) {
|
||||
C[i * ldc + j] -= B[k * ldb + j];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
float *random_matrix(int rows, int cols) {
|
||||
int i;
|
||||
float *m = (float*)calloc(rows * cols, sizeof(float));
|
||||
for (i = 0; i < rows * cols; ++i) {
|
||||
m[i] = (float) rand() / RAND_MAX;
|
||||
}
|
||||
return m;
|
||||
}
|
||||
|
||||
void time_random_matrix(int TA, int TB, int m, int k, int n) {
|
||||
float *a;
|
||||
if (!TA) a = random_matrix(m, k);
|
||||
else a = random_matrix(k, m);
|
||||
int lda = (!TA) ? k : m;
|
||||
float *b;
|
||||
if (!TB) b = random_matrix(k, n);
|
||||
else b = random_matrix(n, k);
|
||||
int ldb = (!TB) ? n : k;
|
||||
|
||||
float *c = random_matrix(m, n);
|
||||
int i;
|
||||
clock_t start = clock(), end;
|
||||
for (i = 0; i < 10; ++i) {
|
||||
gemm_cpu(TA, TB, m, n, k, 1, a, lda, b, ldb, 1, c, n);
|
||||
}
|
||||
end = clock();
|
||||
printf("Matrix Multiplication %dx%d * %dx%d, TA=%d, TB=%d: %lf ms\n", m, k, k, n, TA, TB,
|
||||
(float) (end - start) / CLOCKS_PER_SEC);
|
||||
free(a);
|
||||
free(b);
|
||||
free(c);
|
||||
}
|
||||
|
||||
|
||||
/* Public GEMM entry point: C = ALPHA * op(A) * op(B) + BETA * C, where
   op(X) is X or X^T depending on the TA/TB flags. Delegates directly to
   the CPU implementation. */
void gemm(int TA, int TB, int M, int N, int K, float ALPHA,
        float *A, int lda,
        float *B, int ldb,
        float BETA,
        float *C, int ldc) {
    gemm_cpu(TA, TB, M, N, K,
             ALPHA, A, lda,
             B, ldb,
             BETA, C, ldc);
}
|
||||
|
||||
/* C += ALPHA * A * B for row-major matrices, no transposes.
   A is M x K (leading dim lda), B is K x N (ldb), C is M x N (ldc).
   Bug fix: j and k were declared at function scope, so under
   `#pragma omp parallel for` they were SHARED across threads — a data
   race when built with -fopenmp. Declaring them inside the loops makes
   them thread-private. `register` is also dropped (deprecated, no-op on
   modern compilers). */
void gemm_nn(int M, int N, int K, float ALPHA,
        float *A, int lda,
        float *B, int ldb,
        float *C, int ldc) {
    int i;
    #pragma omp parallel for
    for (i = 0; i < M; ++i) {
        for (int k = 0; k < K; ++k) {
            /* Hoist the scaled A element; the inner loop is then a saxpy. */
            float A_PART = ALPHA * A[i * lda + k];
            for (int j = 0; j < N; ++j) {
                C[i * ldc + j] += A_PART * B[k * ldb + j];
            }
        }
    }
}
|
||||
|
||||
/* C += ALPHA * A * B^T. A is M x K (lda); B is stored N x K (ldb), so
   B^T is read by walking B's rows.
   Bug fix: j and k were declared at function scope and therefore SHARED
   under `#pragma omp parallel for` — a data race with -fopenmp. They are
   now loop-local (thread-private). `register` dropped as deprecated. */
void gemm_nt(int M, int N, int K, float ALPHA,
        float *A, int lda,
        float *B, int ldb,
        float *C, int ldc) {
    int i;
    #pragma omp parallel for
    for (i = 0; i < M; ++i) {
        for (int j = 0; j < N; ++j) {
            /* Dot product of row i of A with row j of B (= column j of B^T). */
            float sum = 0;
            for (int k = 0; k < K; ++k) {
                sum += ALPHA * A[i * lda + k] * B[j * ldb + k];
            }
            C[i * ldc + j] += sum;
        }
    }
}
|
||||
|
||||
/* C += ALPHA * A^T * B. A is stored K x M (lda), so A^T is read by
   indexing A[k * lda + i]; B is K x N (ldb).
   Bug fix: j and k were declared at function scope and therefore SHARED
   under `#pragma omp parallel for` — a data race with -fopenmp. They are
   now loop-local (thread-private). `register` dropped as deprecated. */
void gemm_tn(int M, int N, int K, float ALPHA,
        float *A, int lda,
        float *B, int ldb,
        float *C, int ldc) {
    int i;
    #pragma omp parallel for
    for (i = 0; i < M; ++i) {
        for (int k = 0; k < K; ++k) {
            /* Hoist the scaled transposed-A element for the inner saxpy. */
            float A_PART = ALPHA * A[k * lda + i];
            for (int j = 0; j < N; ++j) {
                C[i * ldc + j] += A_PART * B[k * ldb + j];
            }
        }
    }
}
|
||||
|
||||
/* C += ALPHA * A^T * B^T. A is stored K x M (lda) and B is stored
   N x K (ldb); both operands are read transposed.
   Bug fix: j and k were declared at function scope and therefore SHARED
   under `#pragma omp parallel for` — a data race with -fopenmp. They are
   now loop-local (thread-private). `register` dropped as deprecated. */
void gemm_tt(int M, int N, int K, float ALPHA,
        float *A, int lda,
        float *B, int ldb,
        float *C, int ldc) {
    int i;
    #pragma omp parallel for
    for (i = 0; i < M; ++i) {
        for (int j = 0; j < N; ++j) {
            /* Dot product of column i of A with row j of B. */
            float sum = 0;
            for (int k = 0; k < K; ++k) {
                sum += ALPHA * A[i + k * lda] * B[k + j * ldb];
            }
            C[i * ldc + j] += sum;
        }
    }
}
|
||||
|
||||
|
||||
/* CPU GEMM: C = ALPHA * op(A) * op(B) + BETA * C. op(X) is X or X^T
   depending on the TA/TB flags; dispatches to the matching kernel. */
void gemm_cpu(int TA, int TB, int M, int N, int K, float ALPHA,
        float *A, int lda,
        float *B, int ldb,
        float BETA,
        float *C, int ldc) {
    /* Scale the destination first so every kernel can simply accumulate. */
    for (int row = 0; row < M; ++row) {
        for (int col = 0; col < N; ++col) {
            C[row * ldc + col] *= BETA;
        }
    }
    if (TA) {
        if (TB) gemm_tt(M, N, K, ALPHA, A, lda, B, ldb, C, ldc);
        else    gemm_tn(M, N, K, ALPHA, A, lda, B, ldb, C, ldc);
    } else {
        if (TB) gemm_nt(M, N, K, ALPHA, A, lda, B, ldb, C, ldc);
        else    gemm_nn(M, N, K, ALPHA, A, lda, B, ldb, C, ldc);
    }
}
|
||||
|
||||
#include "gemm.h"
|
||||
#include "utils.h"
|
||||
|
||||
#ifdef GPU
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <time.h>
|
||||
#include "hip/hip_runtime.h"
|
||||
#include "hiprand.h"
|
||||
#include "hipblas.h"
|
||||
|
||||
/* GPU GEMM via hipBLAS: C = ALPHA * op(A) * op(B) + BETA * C.
   hipblasSgemm uses column-major storage, so the row-major product is
   obtained by swapping the operands and dimensions (B first, N/M swapped,
   and the transpose flags exchanged accordingly).
   All three matrix pointers must point to device memory. */
void gemm_gpu(int TA, int TB, int M, int N, int K, float ALPHA,
        float *A_gpu, int lda,
        float *B_gpu, int ldb,
        float BETA,
        float *C_gpu, int ldc)
{
    hipblasHandle_t handle = blas_handle();
    hipblasStatus_t status = hipblasSgemm(handle, (TB ? HIPBLAS_OP_T : HIPBLAS_OP_N),
            (TA ? HIPBLAS_OP_T : HIPBLAS_OP_N), N, M, K, &ALPHA, B_gpu, ldb, A_gpu, lda, &BETA, C_gpu, ldc);
    /* NOTE(review): status is computed but never checked; the check below was
       disabled, presumably because check_error expects a hipError_t rather
       than a hipblasStatus_t — confirm and restore error handling. */
    // check_error(status);
}
|
||||
|
||||
|
||||
|
||||
/* Benchmark gemm_gpu: run 32 (m x k) * (k x n) products on random data
   (transpose flags TA/TB select operand layout) and print the elapsed
   CPU time in seconds.
   NOTE(review): a, b and c are host allocations from random_matrix, yet
   gemm_gpu expects device pointers — verify this path is still used, or
   whether time_gpu (which uploads via cuda_make_array) superseded it. */
void time_gpu_random_matrix(int TA, int TB, int m, int k, int n)
{
    float *a;
    if(!TA) a = random_matrix(m,k);
    else a = random_matrix(k,m);
    int lda = (!TA)?k:m;
    float *b;
    if(!TB) b = random_matrix(k,n);
    else b = random_matrix(n,k);
    int ldb = (!TB)?n:k;

    float *c = random_matrix(m,n);
    int i;
    clock_t start = clock(), end;
    for(i = 0; i<32; ++i){
        gemm_gpu(TA,TB,m,n,k,1,a,lda,b,ldb,1,c,n);
    }
    end = clock();
    /* clock() measures CPU time, not GPU wall time; there is no device
       synchronization here, so this mostly times kernel launches. */
    printf("Matrix Multiplication %dx%d * %dx%d, TA=%d, TB=%d: %lf s\n",m,k,k,n, TA, TB, (float)(end-start)/CLOCKS_PER_SEC);
    free(a);
    free(b);
    free(c);
}
|
||||
|
||||
/* Benchmark gemm_gpu with device-resident operands: upload random host
   matrices, run `iter` multiplications with a device sync after each,
   and print elapsed seconds plus achieved GFLOPS. */
void time_gpu(int TA, int TB, int m, int k, int n)
{
    int iter = 10;
    float *a = random_matrix(m,k);
    float *b = random_matrix(k,n);

    int lda = (!TA)?k:m;
    int ldb = (!TB)?n:k;

    float *c = random_matrix(m,n);

    /* Upload host buffers to the device; *_cl are device pointers. */
    float *a_cl = cuda_make_array(a, m*k);
    float *b_cl = cuda_make_array(b, k*n);
    float *c_cl = cuda_make_array(c, m*n);

    int i;
    clock_t start = clock(), end;
    for(i = 0; i<iter; ++i){
        gemm_gpu(TA,TB,m,n,k,1,a_cl,lda,b_cl,ldb,1,c_cl,n);
        /* Synchronize so clock() captures the kernel's real runtime. */
        hipDeviceSynchronize();
    }
    /* FLOP count: ~2*k multiply-adds per output element, times iterations.
       NOTE(review): the "+ 2." per element presumably accounts for the
       BETA scale and accumulate — confirm the intended model. */
    double flop = ((double)m)*n*(2.*k + 2.)*iter;
    double gflop = flop/pow(10., 9);
    end = clock();
    double seconds = sec(end-start);
    printf("Matrix Multiplication %dx%d * %dx%d, TA=%d, TB=%d: %lf s, %lf GFLOPS\n",m,k,k,n, TA, TB, seconds, gflop/seconds);
    cuda_free(a_cl);
    cuda_free(b_cl);
    cuda_free(c_cl);
    free(a);
    free(b);
    free(c);
}
|
||||
|
||||
|
||||
/* Compare gemm_gpu against gemm_cpu on identical random inputs and print
   the mean squared difference (labeled SSE, but divided by m*n).
   srand(0) makes the comparison reproducible across runs.
   NOTE(review): a and b are host allocations passed straight to gemm_gpu,
   which expects device pointers — verify how this test is expected to run. */
void test_gpu_accuracy(int TA, int TB, int m, int k, int n)
{
    srand(0);
    float *a;
    if(!TA) a = random_matrix(m,k);
    else a = random_matrix(k,m);
    int lda = (!TA)?k:m;
    float *b;
    if(!TB) b = random_matrix(k,n);
    else b = random_matrix(n,k);
    int ldb = (!TB)?n:k;

    float *c = random_matrix(m,n);
    float *c_gpu = random_matrix(m,n);
    /* Zero both accumulators so GPU and CPU start from the same C. */
    memset(c, 0, m*n*sizeof(float));
    memset(c_gpu, 0, m*n*sizeof(float));
    int i;
    //pm(m,k,b);
    gemm_gpu(TA,TB,m,n,k,1,a,lda,b,ldb,1,c_gpu,n);
    //printf("GPU\n");
    //pm(m, n, c_gpu);

    gemm_cpu(TA,TB,m,n,k,1,a,lda,b,ldb,1,c,n);
    //printf("\n\nCPU\n");
    //pm(m, n, c);
    /* Accumulate squared elementwise differences between the two results. */
    double sse = 0;
    for(i = 0; i < m*n; ++i) {
        //printf("%f %f\n", c[i], c_gpu[i]);
        sse += pow(c[i]-c_gpu[i], 2);
    }
    printf("Matrix Multiplication %dx%d * %dx%d, TA=%d, TB=%d: %g SSE\n",m,k,k,n, TA, TB, sse/(m*n));
    free(a);
    free(b);
    free(c);
    free(c_gpu);
}
|
||||
|
||||
/* Smoke-test / benchmark driver for the GPU BLAS path: times gemm_gpu on
   a set of convolution-shaped matrix sizes. The accuracy checks and the
   older timing set are kept commented out for manual re-enabling.
   Always returns 0. */
int test_gpu_blas()
{
    /*
    test_gpu_accuracy(0,0,10,576,75);

    test_gpu_accuracy(0,0,17,10,10);
    test_gpu_accuracy(1,0,17,10,10);
    test_gpu_accuracy(0,1,17,10,10);
    test_gpu_accuracy(1,1,17,10,10);

    test_gpu_accuracy(0,0,1000,10,100);
    test_gpu_accuracy(1,0,1000,10,100);
    test_gpu_accuracy(0,1,1000,10,100);
    test_gpu_accuracy(1,1,1000,10,100);

    test_gpu_accuracy(0,0,10,10,10);

    time_gpu(0,0,64,2916,363);
    time_gpu(0,0,64,2916,363);
    time_gpu(0,0,64,2916,363);
    time_gpu(0,0,192,729,1600);
    time_gpu(0,0,384,196,1728);
    time_gpu(0,0,256,196,3456);
    time_gpu(0,0,256,196,2304);
    time_gpu(0,0,128,4096,12544);
    time_gpu(0,0,128,4096,4096);
    */
    /* The first size is repeated to warm up the device before timing. */
    time_gpu(0,0,64,75,12544);
    time_gpu(0,0,64,75,12544);
    time_gpu(0,0,64,75,12544);
    time_gpu(0,0,64,576,12544);
    time_gpu(0,0,256,2304,784);
    time_gpu(1,1,2304,256,784);
    time_gpu(0,0,512,4608,196);
    time_gpu(1,1,4608,512,196);

    return 0;
}
|
||||
#endif
|
||||
|
30
src/gemm.h
30
src/gemm.h
|
@ -1,22 +1,22 @@
|
|||
#ifndef GEMM_H
|
||||
#define GEMM_H
|
||||
|
||||
void gemm_bin(int M, int N, int K, float ALPHA,
|
||||
char *A, int lda,
|
||||
float *B, int ldb,
|
||||
float *C, int ldc);
|
||||
|
||||
void gemm(int TA, int TB, int M, int N, int K, float ALPHA,
|
||||
float *A, int lda,
|
||||
float *B, int ldb,
|
||||
float BETA,
|
||||
float *C, int ldc);
|
||||
void gemm_bin(int M, int N, int K, float ALPHA,
|
||||
char *A, int lda,
|
||||
float *B, int ldb,
|
||||
float *C, int ldc);
|
||||
|
||||
void gemm_cpu(int TA, int TB, int M, int N, int K, float ALPHA,
|
||||
float *A, int lda,
|
||||
float *B, int ldb,
|
||||
float BETA,
|
||||
float *C, int ldc);
|
||||
void gemm(int TA, int TB, int M, int N, int K, float ALPHA,
|
||||
float *A, int lda,
|
||||
float *B, int ldb,
|
||||
float BETA,
|
||||
float *C, int ldc);
|
||||
|
||||
void gemm_cpu(int TA, int TB, int M, int N, int K, float ALPHA,
|
||||
float *A, int lda,
|
||||
float *B, int ldb,
|
||||
float BETA,
|
||||
float *C, int ldc);
|
||||
|
||||
#ifdef GPU
|
||||
void gemm_gpu(int TA, int TB, int M, int N, int K, float ALPHA,
|
||||
|
|
|
@ -0,0 +1,163 @@
|
|||
#include "gemm.h"
|
||||
#include "utils.h"
|
||||
|
||||
#ifdef GPU
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <time.h>
|
||||
#include "hip/hip_runtime.h"
|
||||
#include "hiprand.h"
|
||||
#include "hipblas.h"
|
||||
|
||||
void gemm_gpu(int TA, int TB, int M, int N, int K, float ALPHA,
|
||||
float *A_gpu, int lda,
|
||||
float *B_gpu, int ldb,
|
||||
float BETA,
|
||||
float *C_gpu, int ldc)
|
||||
{
|
||||
hipblasHandle_t handle = blas_handle();
|
||||
hipError_t status = hipblasSgemm(handle, (TB ? HIPBLAS_OP_T : HIPBLAS_OP_N),
|
||||
(TA ? HIPBLAS_OP_T : HIPBLAS_OP_N), N, M, K, &ALPHA, B_gpu, ldb, A_gpu, lda, &BETA, C_gpu, ldc);
|
||||
check_error(status);
|
||||
}
|
||||
|
||||
|
||||
|
||||
void time_gpu_random_matrix(int TA, int TB, int m, int k, int n)
|
||||
{
|
||||
float *a;
|
||||
if(!TA) a = random_matrix(m,k);
|
||||
else a = random_matrix(k,m);
|
||||
int lda = (!TA)?k:m;
|
||||
float *b;
|
||||
if(!TB) b = random_matrix(k,n);
|
||||
else b = random_matrix(n,k);
|
||||
int ldb = (!TB)?n:k;
|
||||
|
||||
float *c = random_matrix(m,n);
|
||||
int i;
|
||||
clock_t start = clock(), end;
|
||||
for(i = 0; i<32; ++i){
|
||||
gemm_gpu(TA,TB,m,n,k,1,a,lda,b,ldb,1,c,n);
|
||||
}
|
||||
end = clock();
|
||||
printf("Matrix Multiplication %dx%d * %dx%d, TA=%d, TB=%d: %lf s\n",m,k,k,n, TA, TB, (float)(end-start)/CLOCKS_PER_SEC);
|
||||
free(a);
|
||||
free(b);
|
||||
free(c);
|
||||
}
|
||||
|
||||
void time_gpu(int TA, int TB, int m, int k, int n)
|
||||
{
|
||||
int iter = 10;
|
||||
float *a = random_matrix(m,k);
|
||||
float *b = random_matrix(k,n);
|
||||
|
||||
int lda = (!TA)?k:m;
|
||||
int ldb = (!TB)?n:k;
|
||||
|
||||
float *c = random_matrix(m,n);
|
||||
|
||||
float *a_cl = cuda_make_array(a, m*k);
|
||||
float *b_cl = cuda_make_array(b, k*n);
|
||||
float *c_cl = cuda_make_array(c, m*n);
|
||||
|
||||
int i;
|
||||
clock_t start = clock(), end;
|
||||
for(i = 0; i<iter; ++i){
|
||||
gemm_gpu(TA,TB,m,n,k,1,a_cl,lda,b_cl,ldb,1,c_cl,n);
|
||||
hipDeviceSynchronize();
|
||||
}
|
||||
double flop = ((double)m)*n*(2.*k + 2.)*iter;
|
||||
double gflop = flop/pow(10., 9);
|
||||
end = clock();
|
||||
double seconds = sec(end-start);
|
||||
printf("Matrix Multiplication %dx%d * %dx%d, TA=%d, TB=%d: %lf s, %lf GFLOPS\n",m,k,k,n, TA, TB, seconds, gflop/seconds);
|
||||
cuda_free(a_cl);
|
||||
cuda_free(b_cl);
|
||||
cuda_free(c_cl);
|
||||
free(a);
|
||||
free(b);
|
||||
free(c);
|
||||
}
|
||||
|
||||
|
||||
void test_gpu_accuracy(int TA, int TB, int m, int k, int n)
|
||||
{
|
||||
srand(0);
|
||||
float *a;
|
||||
if(!TA) a = random_matrix(m,k);
|
||||
else a = random_matrix(k,m);
|
||||
int lda = (!TA)?k:m;
|
||||
float *b;
|
||||
if(!TB) b = random_matrix(k,n);
|
||||
else b = random_matrix(n,k);
|
||||
int ldb = (!TB)?n:k;
|
||||
|
||||
float *c = random_matrix(m,n);
|
||||
float *c_gpu = random_matrix(m,n);
|
||||
memset(c, 0, m*n*sizeof(float));
|
||||
memset(c_gpu, 0, m*n*sizeof(float));
|
||||
int i;
|
||||
//pm(m,k,b);
|
||||
gemm_gpu(TA,TB,m,n,k,1,a,lda,b,ldb,1,c_gpu,n);
|
||||
//printf("GPU\n");
|
||||
//pm(m, n, c_gpu);
|
||||
|
||||
gemm_cpu(TA,TB,m,n,k,1,a,lda,b,ldb,1,c,n);
|
||||
//printf("\n\nCPU\n");
|
||||
//pm(m, n, c);
|
||||
double sse = 0;
|
||||
for(i = 0; i < m*n; ++i) {
|
||||
//printf("%f %f\n", c[i], c_gpu[i]);
|
||||
sse += pow(c[i]-c_gpu[i], 2);
|
||||
}
|
||||
printf("Matrix Multiplication %dx%d * %dx%d, TA=%d, TB=%d: %g SSE\n",m,k,k,n, TA, TB, sse/(m*n));
|
||||
free(a);
|
||||
free(b);
|
||||
free(c);
|
||||
free(c_gpu);
|
||||
}
|
||||
|
||||
int test_gpu_blas()
|
||||
{
|
||||
/*
|
||||
test_gpu_accuracy(0,0,10,576,75);
|
||||
|
||||
test_gpu_accuracy(0,0,17,10,10);
|
||||
test_gpu_accuracy(1,0,17,10,10);
|
||||
test_gpu_accuracy(0,1,17,10,10);
|
||||
test_gpu_accuracy(1,1,17,10,10);
|
||||
|
||||
test_gpu_accuracy(0,0,1000,10,100);
|
||||
test_gpu_accuracy(1,0,1000,10,100);
|
||||
test_gpu_accuracy(0,1,1000,10,100);
|
||||
test_gpu_accuracy(1,1,1000,10,100);
|
||||
|
||||
test_gpu_accuracy(0,0,10,10,10);
|
||||
|
||||
time_gpu(0,0,64,2916,363);
|
||||
time_gpu(0,0,64,2916,363);
|
||||
time_gpu(0,0,64,2916,363);
|
||||
time_gpu(0,0,192,729,1600);
|
||||
time_gpu(0,0,384,196,1728);
|
||||
time_gpu(0,0,256,196,3456);
|
||||
time_gpu(0,0,256,196,2304);
|
||||
time_gpu(0,0,128,4096,12544);
|
||||
time_gpu(0,0,128,4096,4096);
|
||||
*/
|
||||
time_gpu(0,0,64,75,12544);
|
||||
time_gpu(0,0,64,75,12544);
|
||||
time_gpu(0,0,64,75,12544);
|
||||
time_gpu(0,0,64,576,12544);
|
||||
time_gpu(0,0,256,2304,784);
|
||||
time_gpu(1,1,2304,256,784);
|
||||
time_gpu(0,0,512,4608,196);
|
||||
time_gpu(1,1,4608,512,196);
|
||||
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
|
@ -0,0 +1,163 @@
|
|||
#include "gemm.h"
|
||||
#include "utils.h"
|
||||
|
||||
#ifdef GPU
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <time.h>
|
||||
#include "hip/hip_runtime.h"
|
||||
#include "hiprand.h"
|
||||
#include "hipblas.h"
|
||||
|
||||
void gemm_gpu(int TA, int TB, int M, int N, int K, float ALPHA,
|
||||
float *A_gpu, int lda,
|
||||
float *B_gpu, int ldb,
|
||||
float BETA,
|
||||
float *C_gpu, int ldc)
|
||||
{
|
||||
hipblasHandle_t handle = blas_handle();
|
||||
hipError_t status = hipblasSgemm(handle, (TB ? HIPBLAS_OP_T : HIPBLAS_OP_N),
|
||||
(TA ? HIPBLAS_OP_T : HIPBLAS_OP_N), N, M, K, &ALPHA, B_gpu, ldb, A_gpu, lda, &BETA, C_gpu, ldc);
|
||||
check_error(status);
|
||||
}
|
||||
|
||||
|
||||
|
||||
void time_gpu_random_matrix(int TA, int TB, int m, int k, int n)
|
||||
{
|
||||
float *a;
|
||||
if(!TA) a = random_matrix(m,k);
|
||||
else a = random_matrix(k,m);
|
||||
int lda = (!TA)?k:m;
|
||||
float *b;
|
||||
if(!TB) b = random_matrix(k,n);
|
||||
else b = random_matrix(n,k);
|
||||
int ldb = (!TB)?n:k;
|
||||
|
||||
float *c = random_matrix(m,n);
|
||||
int i;
|
||||
clock_t start = clock(), end;
|
||||
for(i = 0; i<32; ++i){
|
||||
gemm_gpu(TA,TB,m,n,k,1,a,lda,b,ldb,1,c,n);
|
||||
}
|
||||
end = clock();
|
||||
printf("Matrix Multiplication %dx%d * %dx%d, TA=%d, TB=%d: %lf s\n",m,k,k,n, TA, TB, (float)(end-start)/CLOCKS_PER_SEC);
|
||||
free(a);
|
||||
free(b);
|
||||
free(c);
|
||||
}
|
||||
|
||||
void time_gpu(int TA, int TB, int m, int k, int n)
|
||||
{
|
||||
int iter = 10;
|
||||
float *a = random_matrix(m,k);
|
||||
float *b = random_matrix(k,n);
|
||||
|
||||
int lda = (!TA)?k:m;
|
||||
int ldb = (!TB)?n:k;
|
||||
|
||||
float *c = random_matrix(m,n);
|
||||
|
||||
float *a_cl = cuda_make_array(a, m*k);
|
||||
float *b_cl = cuda_make_array(b, k*n);
|
||||
float *c_cl = cuda_make_array(c, m*n);
|
||||
|
||||
int i;
|
||||
clock_t start = clock(), end;
|
||||
for(i = 0; i<iter; ++i){
|
||||
gemm_gpu(TA,TB,m,n,k,1,a_cl,lda,b_cl,ldb,1,c_cl,n);
|
||||
hipDeviceSynchronize();
|
||||
}
|
||||
double flop = ((double)m)*n*(2.*k + 2.)*iter;
|
||||
double gflop = flop/pow(10., 9);
|
||||
end = clock();
|
||||
double seconds = sec(end-start);
|
||||
printf("Matrix Multiplication %dx%d * %dx%d, TA=%d, TB=%d: %lf s, %lf GFLOPS\n",m,k,k,n, TA, TB, seconds, gflop/seconds);
|
||||
cuda_free(a_cl);
|
||||
cuda_free(b_cl);
|
||||
cuda_free(c_cl);
|
||||
free(a);
|
||||
free(b);
|
||||
free(c);
|
||||
}
|
||||
|
||||
|
||||
void test_gpu_accuracy(int TA, int TB, int m, int k, int n)
|
||||
{
|
||||
srand(0);
|
||||
float *a;
|
||||
if(!TA) a = random_matrix(m,k);
|
||||
else a = random_matrix(k,m);
|
||||
int lda = (!TA)?k:m;
|
||||
float *b;
|
||||
if(!TB) b = random_matrix(k,n);
|
||||
else b = random_matrix(n,k);
|
||||
int ldb = (!TB)?n:k;
|
||||
|
||||
float *c = random_matrix(m,n);
|
||||
float *c_gpu = random_matrix(m,n);
|
||||
memset(c, 0, m*n*sizeof(float));
|
||||
memset(c_gpu, 0, m*n*sizeof(float));
|
||||
int i;
|
||||
//pm(m,k,b);
|
||||
gemm_gpu(TA,TB,m,n,k,1,a,lda,b,ldb,1,c_gpu,n);
|
||||
//printf("GPU\n");
|
||||
//pm(m, n, c_gpu);
|
||||
|
||||
gemm_cpu(TA,TB,m,n,k,1,a,lda,b,ldb,1,c,n);
|
||||
//printf("\n\nCPU\n");
|
||||
//pm(m, n, c);
|
||||
double sse = 0;
|
||||
for(i = 0; i < m*n; ++i) {
|
||||
//printf("%f %f\n", c[i], c_gpu[i]);
|
||||
sse += pow(c[i]-c_gpu[i], 2);
|
||||
}
|
||||
printf("Matrix Multiplication %dx%d * %dx%d, TA=%d, TB=%d: %g SSE\n",m,k,k,n, TA, TB, sse/(m*n));
|
||||
free(a);
|
||||
free(b);
|
||||
free(c);
|
||||
free(c_gpu);
|
||||
}
|
||||
|
||||
int test_gpu_blas()
|
||||
{
|
||||
/*
|
||||
test_gpu_accuracy(0,0,10,576,75);
|
||||
|
||||
test_gpu_accuracy(0,0,17,10,10);
|
||||
test_gpu_accuracy(1,0,17,10,10);
|
||||
test_gpu_accuracy(0,1,17,10,10);
|
||||
test_gpu_accuracy(1,1,17,10,10);
|
||||
|
||||
test_gpu_accuracy(0,0,1000,10,100);
|
||||
test_gpu_accuracy(1,0,1000,10,100);
|
||||
test_gpu_accuracy(0,1,1000,10,100);
|
||||
test_gpu_accuracy(1,1,1000,10,100);
|
||||
|
||||
test_gpu_accuracy(0,0,10,10,10);
|
||||
|
||||
time_gpu(0,0,64,2916,363);
|
||||
time_gpu(0,0,64,2916,363);
|
||||
time_gpu(0,0,64,2916,363);
|
||||
time_gpu(0,0,192,729,1600);
|
||||
time_gpu(0,0,384,196,1728);
|
||||
time_gpu(0,0,256,196,3456);
|
||||
time_gpu(0,0,256,196,2304);
|
||||
time_gpu(0,0,128,4096,12544);
|
||||
time_gpu(0,0,128,4096,4096);
|
||||
*/
|
||||
time_gpu(0,0,64,75,12544);
|
||||
time_gpu(0,0,64,75,12544);
|
||||
time_gpu(0,0,64,75,12544);
|
||||
time_gpu(0,0,64,576,12544);
|
||||
time_gpu(0,0,256,2304,784);
|
||||
time_gpu(1,1,2304,256,784);
|
||||
time_gpu(0,0,512,4608,196);
|
||||
time_gpu(1,1,4608,512,196);
|
||||
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
|
@ -1,7 +1,6 @@
|
|||
#include "gru_layer.h"
|
||||
#include "connected_layer.h"
|
||||
#include "utils.h"
|
||||
#include "cuda.h"
|
||||
#include "blas.h"
|
||||
#include "gemm.h"
|
||||
|
||||
|
@ -10,9 +9,12 @@
|
|||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
static void increment_layer(layer *l, int steps)
|
||||
{
|
||||
int num = l->outputs*l->batch*steps;
|
||||
#ifdef GPU
|
||||
#include "hip/hip_runtime.h"
|
||||
#endif
|
||||
|
||||
static void increment_layer(layer *l, int steps) {
|
||||
int num = l->outputs * l->batch * steps;
|
||||
l->output += num;
|
||||
l->delta += num;
|
||||
l->x += num;
|
||||
|
@ -26,62 +28,60 @@ static void increment_layer(layer *l, int steps)
|
|||
#endif
|
||||
}
|
||||
|
||||
layer make_gru_layer(int batch, int inputs, int outputs, int steps, int batch_normalize, int adam)
|
||||
{
|
||||
layer make_gru_layer(int batch, int inputs, int outputs, int steps, int batch_normalize, int adam) {
|
||||
fprintf(stderr, "GRU Layer: %d inputs, %d outputs\n", inputs, outputs);
|
||||
batch = batch / steps;
|
||||
layer l = {0};
|
||||
layer l = {(LAYER_TYPE)0};
|
||||
l.batch = batch;
|
||||
l.type = GRU;
|
||||
l.steps = steps;
|
||||
l.inputs = inputs;
|
||||
|
||||
l.uz = malloc(sizeof(layer));
|
||||
l.uz = (layer *) malloc(sizeof(layer));
|
||||
fprintf(stderr, "\t\t");
|
||||
*(l.uz) = make_connected_layer(batch*steps, inputs, outputs, LINEAR, batch_normalize, adam);
|
||||
*(l.uz) = make_connected_layer(batch * steps, inputs, outputs, LINEAR, batch_normalize, adam);
|
||||
l.uz->batch = batch;
|
||||
|
||||
l.wz = malloc(sizeof(layer));
|
||||
l.wz = (layer *) malloc(sizeof(layer));
|
||||
fprintf(stderr, "\t\t");
|
||||
*(l.wz) = make_connected_layer(batch*steps, outputs, outputs, LINEAR, batch_normalize, adam);
|
||||
*(l.wz) = make_connected_layer(batch * steps, outputs, outputs, LINEAR, batch_normalize, adam);
|
||||
l.wz->batch = batch;
|
||||
|
||||
l.ur = malloc(sizeof(layer));
|
||||
l.ur = (layer *) malloc(sizeof(layer));
|
||||
fprintf(stderr, "\t\t");
|
||||
*(l.ur) = make_connected_layer(batch*steps, inputs, outputs, LINEAR, batch_normalize, adam);
|
||||
*(l.ur) = make_connected_layer(batch * steps, inputs, outputs, LINEAR, batch_normalize, adam);
|
||||
l.ur->batch = batch;
|
||||
|
||||
l.wr = malloc(sizeof(layer));
|
||||
l.wr = (layer *) malloc(sizeof(layer));
|
||||
fprintf(stderr, "\t\t");
|
||||
*(l.wr) = make_connected_layer(batch*steps, outputs, outputs, LINEAR, batch_normalize, adam);
|
||||
*(l.wr) = make_connected_layer(batch * steps, outputs, outputs, LINEAR, batch_normalize, adam);
|
||||
l.wr->batch = batch;
|
||||
|
||||
|
||||
|
||||
l.uh = malloc(sizeof(layer));
|
||||
l.uh = (layer *) malloc(sizeof(layer));
|
||||
fprintf(stderr, "\t\t");
|
||||
*(l.uh) = make_connected_layer(batch*steps, inputs, outputs, LINEAR, batch_normalize, adam);
|
||||
*(l.uh) = make_connected_layer(batch * steps, inputs, outputs, LINEAR, batch_normalize, adam);
|
||||
l.uh->batch = batch;
|
||||
|
||||
l.wh = malloc(sizeof(layer));
|
||||
l.wh = (layer *) malloc(sizeof(layer));
|
||||
fprintf(stderr, "\t\t");
|
||||
*(l.wh) = make_connected_layer(batch*steps, outputs, outputs, LINEAR, batch_normalize, adam);
|
||||
*(l.wh) = make_connected_layer(batch * steps, outputs, outputs, LINEAR, batch_normalize, adam);
|
||||
l.wh->batch = batch;
|
||||
|
||||
l.batch_normalize = batch_normalize;
|
||||
|
||||
|
||||
l.outputs = outputs;
|
||||
l.output = calloc(outputs*batch*steps, sizeof(float));
|
||||
l.delta = calloc(outputs*batch*steps, sizeof(float));
|
||||
l.state = calloc(outputs*batch, sizeof(float));
|
||||
l.prev_state = calloc(outputs*batch, sizeof(float));
|
||||
l.forgot_state = calloc(outputs*batch, sizeof(float));
|
||||
l.forgot_delta = calloc(outputs*batch, sizeof(float));
|
||||
l.output = (float*) calloc(outputs * batch * steps, sizeof(float));
|
||||
l.delta = (float*) calloc(outputs * batch * steps, sizeof(float));
|
||||
l.state = (float*) calloc(outputs * batch, sizeof(float));
|
||||
l.prev_state = (float*) calloc(outputs * batch, sizeof(float));
|
||||
l.forgot_state = (float*) calloc(outputs * batch, sizeof(float));
|
||||
l.forgot_delta = (float*) calloc(outputs * batch, sizeof(float));
|
||||
|
||||
l.r_cpu = calloc(outputs*batch, sizeof(float));
|
||||
l.z_cpu = calloc(outputs*batch, sizeof(float));
|
||||
l.h_cpu = calloc(outputs*batch, sizeof(float));
|
||||
l.r_cpu = (float*) calloc(outputs * batch, sizeof(float));
|
||||
l.z_cpu = (float*) calloc(outputs * batch, sizeof(float));
|
||||
l.h_cpu = (float*) calloc(outputs * batch, sizeof(float));
|
||||
|
||||
l.forward = forward_gru_layer;
|
||||
l.backward = backward_gru_layer;
|
||||
|
@ -103,20 +103,19 @@ layer make_gru_layer(int batch, int inputs, int outputs, int steps, int batch_no
|
|||
l.h_gpu = cuda_make_array(0, batch*outputs);
|
||||
|
||||
#ifdef CUDNN
|
||||
cudnnSetTensor4dDescriptor(l.uz->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.uz->out_c, l.uz->out_h, l.uz->out_w);
|
||||
cudnnSetTensor4dDescriptor(l.uh->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.uh->out_c, l.uh->out_h, l.uh->out_w);
|
||||
cudnnSetTensor4dDescriptor(l.ur->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.ur->out_c, l.ur->out_h, l.ur->out_w);
|
||||
cudnnSetTensor4dDescriptor(l.wz->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.wz->out_c, l.wz->out_h, l.wz->out_w);
|
||||
cudnnSetTensor4dDescriptor(l.wh->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.wh->out_c, l.wh->out_h, l.wh->out_w);
|
||||
cudnnSetTensor4dDescriptor(l.wr->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.wr->out_c, l.wr->out_h, l.wr->out_w);
|
||||
hipdnnSetTensor4dDescriptor(l.uz->dstTensorDesc, HIPDNN_TENSOR_NCHW, HIPDNN_DATA_FLOAT, batch, l.uz->out_c, l.uz->out_h, l.uz->out_w);
|
||||
hipdnnSetTensor4dDescriptor(l.uh->dstTensorDesc, HIPDNN_TENSOR_NCHW, HIPDNN_DATA_FLOAT, batch, l.uh->out_c, l.uh->out_h, l.uh->out_w);
|
||||
hipdnnSetTensor4dDescriptor(l.ur->dstTensorDesc, HIPDNN_TENSOR_NCHW, HIPDNN_DATA_FLOAT, batch, l.ur->out_c, l.ur->out_h, l.ur->out_w);
|
||||
hipdnnSetTensor4dDescriptor(l.wz->dstTensorDesc, HIPDNN_TENSOR_NCHW, HIPDNN_DATA_FLOAT, batch, l.wz->out_c, l.wz->out_h, l.wz->out_w);
|
||||
hipdnnSetTensor4dDescriptor(l.wh->dstTensorDesc, HIPDNN_TENSOR_NCHW, HIPDNN_DATA_FLOAT, batch, l.wh->out_c, l.wh->out_h, l.wh->out_w);
|
||||
hipdnnSetTensor4dDescriptor(l.wr->dstTensorDesc, HIPDNN_TENSOR_NCHW, HIPDNN_DATA_FLOAT, batch, l.wr->out_c, l.wr->out_h, l.wr->out_w);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
return l;
|
||||
}
|
||||
|
||||
void update_gru_layer(layer l, update_args a)
|
||||
{
|
||||
void update_gru_layer(layer l, update_args a) {
|
||||
update_connected_layer(*(l.ur), a);
|
||||
update_connected_layer(*(l.uz), a);
|
||||
update_connected_layer(*(l.uh), a);
|
||||
|
@ -125,8 +124,7 @@ void update_gru_layer(layer l, update_args a)
|
|||
update_connected_layer(*(l.wh), a);
|
||||
}
|
||||
|
||||
void forward_gru_layer(layer l, network net)
|
||||
{
|
||||
void forward_gru_layer(layer l, network net) {
|
||||
network s = net;
|
||||
s.train = net.train;
|
||||
int i;
|
||||
|
@ -145,9 +143,9 @@ void forward_gru_layer(layer l, network net)
|
|||
fill_cpu(l.outputs * l.batch * l.steps, 0, wz.delta, 1);
|
||||
fill_cpu(l.outputs * l.batch * l.steps, 0, wr.delta, 1);
|
||||
fill_cpu(l.outputs * l.batch * l.steps, 0, wh.delta, 1);
|
||||
if(net.train) {
|
||||
if (net.train) {
|
||||
fill_cpu(l.outputs * l.batch * l.steps, 0, l.delta, 1);
|
||||
copy_cpu(l.outputs*l.batch, l.state, 1, l.prev_state, 1);
|
||||
copy_cpu(l.outputs * l.batch, l.state, 1, l.prev_state, 1);
|
||||
}
|
||||
|
||||
for (i = 0; i < l.steps; ++i) {
|
||||
|
@ -161,36 +159,36 @@ void forward_gru_layer(layer l, network net)
|
|||
forward_connected_layer(uh, s);
|
||||
|
||||
|
||||
copy_cpu(l.outputs*l.batch, uz.output, 1, l.z_cpu, 1);
|
||||
axpy_cpu(l.outputs*l.batch, 1, wz.output, 1, l.z_cpu, 1);
|
||||
copy_cpu(l.outputs * l.batch, uz.output, 1, l.z_cpu, 1);
|
||||
axpy_cpu(l.outputs * l.batch, 1, wz.output, 1, l.z_cpu, 1);
|
||||
|
||||
copy_cpu(l.outputs*l.batch, ur.output, 1, l.r_cpu, 1);
|
||||
axpy_cpu(l.outputs*l.batch, 1, wr.output, 1, l.r_cpu, 1);
|
||||
copy_cpu(l.outputs * l.batch, ur.output, 1, l.r_cpu, 1);
|
||||
axpy_cpu(l.outputs * l.batch, 1, wr.output, 1, l.r_cpu, 1);
|
||||
|
||||
activate_array(l.z_cpu, l.outputs*l.batch, LOGISTIC);
|
||||
activate_array(l.r_cpu, l.outputs*l.batch, LOGISTIC);
|
||||
activate_array(l.z_cpu, l.outputs * l.batch, LOGISTIC);
|
||||
activate_array(l.r_cpu, l.outputs * l.batch, LOGISTIC);
|
||||
|
||||
copy_cpu(l.outputs*l.batch, l.state, 1, l.forgot_state, 1);
|
||||
mul_cpu(l.outputs*l.batch, l.r_cpu, 1, l.forgot_state, 1);
|
||||
copy_cpu(l.outputs * l.batch, l.state, 1, l.forgot_state, 1);
|
||||
mul_cpu(l.outputs * l.batch, l.r_cpu, 1, l.forgot_state, 1);
|
||||
|
||||
s.input = l.forgot_state;
|
||||
forward_connected_layer(wh, s);
|
||||
|
||||
copy_cpu(l.outputs*l.batch, uh.output, 1, l.h_cpu, 1);
|
||||
axpy_cpu(l.outputs*l.batch, 1, wh.output, 1, l.h_cpu, 1);
|
||||
copy_cpu(l.outputs * l.batch, uh.output, 1, l.h_cpu, 1);
|
||||
axpy_cpu(l.outputs * l.batch, 1, wh.output, 1, l.h_cpu, 1);
|
||||
|
||||
if(l.tanh){
|
||||
activate_array(l.h_cpu, l.outputs*l.batch, TANH);
|
||||
if (l.tanh) {
|
||||
activate_array(l.h_cpu, l.outputs * l.batch, TANH);
|
||||
} else {
|
||||
activate_array(l.h_cpu, l.outputs*l.batch, LOGISTIC);
|
||||
activate_array(l.h_cpu, l.outputs * l.batch, LOGISTIC);
|
||||
}
|
||||
|
||||
weighted_sum_cpu(l.state, l.h_cpu, l.z_cpu, l.outputs*l.batch, l.output);
|
||||
weighted_sum_cpu(l.state, l.h_cpu, l.z_cpu, l.outputs * l.batch, l.output);
|
||||
|
||||
copy_cpu(l.outputs*l.batch, l.output, 1, l.state, 1);
|
||||
copy_cpu(l.outputs * l.batch, l.output, 1, l.state, 1);
|
||||
|
||||
net.input += l.inputs*l.batch;
|
||||
l.output += l.outputs*l.batch;
|
||||
net.input += l.inputs * l.batch;
|
||||
l.output += l.outputs * l.batch;
|
||||
increment_layer(&uz, 1);
|
||||
increment_layer(&ur, 1);
|
||||
increment_layer(&uh, 1);
|
||||
|
@ -201,8 +199,7 @@ void forward_gru_layer(layer l, network net)
|
|||
}
|
||||
}
|
||||
|
||||
void backward_gru_layer(layer l, network net)
|
||||
{
|
||||
void backward_gru_layer(layer l, network net) {
|
||||
}
|
||||
|
||||
#ifdef GPU
|
|
@ -9,7 +9,9 @@
|
|||
layer make_gru_layer(int batch, int inputs, int outputs, int steps, int batch_normalize, int adam);
|
||||
|
||||
void forward_gru_layer(layer l, network state);
|
||||
|
||||
void backward_gru_layer(layer l, network state);
|
||||
|
||||
void update_gru_layer(layer l, update_args a);
|
||||
|
||||
#ifdef GPU
|
||||
|
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue